From 76a099646bccf91aba8bee3116b36298cf39cd20 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 4 Jan 2024 00:20:21 +0000 Subject: [PATCH] =?UTF-8?q?Sync=20with=20whisper.cpp=20=F0=9F=9A=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/whisper/Package.swift | 10 +- src/whisper/bindings/javascript/package.json | 2 +- .../bindings/ruby/ext/ggml-backend-impl.h | 2 +- src/whisper/bindings/ruby/ext/ggml-backend.c | 4 +- src/whisper/bindings/ruby/ext/ggml-backend.h | 2 +- src/whisper/extra/sync-ggml-am.sh | 17 +- src/whisper/extra/sync-ggml.last | 2 +- src/whisper/ggml-backend-impl.h | 2 +- src/whisper/ggml-backend.c | 10 +- src/whisper/ggml-backend.h | 2 +- src/whisper/ggml-cuda.cu | 53 +- src/whisper/ggml-metal.h | 2 +- src/whisper/ggml-metal.m | 72 +- src/whisper/ggml-metal.metal | 709 ++++++++++++------ src/whisper/ggml-quants.c | 118 +-- src/whisper/ggml.c | 174 ++++- src/whisper/ggml.h | 1 + src/whisper/whisper.cpp | 24 +- 18 files changed, 822 insertions(+), 384 deletions(-) diff --git a/src/whisper/Package.swift b/src/whisper/Package.swift index bbb7fb0..a19dbf4 100644 --- a/src/whisper/Package.swift +++ b/src/whisper/Package.swift @@ -13,9 +13,13 @@ let package = Package( products: [ .library(name: "whisper", targets: ["whisper"]), ], + dependencies: [ + .package(url: "https://github.com/ggerganov/ggml.git", .branch("master")) + ], targets: [ .target( name: "whisper", + dependencies: ["ggml"], path: ".", exclude: [ "bindings", @@ -32,14 +36,8 @@ let package = Package( "Makefile" ], sources: [ - "ggml.c", "whisper.cpp", - "ggml-alloc.c", - "ggml-backend.c", - "ggml-quants.c", - "ggml-metal.m" ], - resources: [.process("ggml-metal.metal")], publicHeadersPath: "spm-headers", cSettings: [ .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]), diff --git a/src/whisper/bindings/javascript/package.json b/src/whisper/bindings/javascript/package.json index 2445896..5b629ac 100644 --- a/src/whisper/bindings/javascript/package.json +++ b/src/whisper/bindings/javascript/package.json @@ -1,6 +1,6 @@ { "name": "whisper.cpp", - "version": "1.5.2", + "version": "1.5.3", "description": "Whisper speech recognition", "main": "whisper.js", "scripts": { diff --git a/src/whisper/bindings/ruby/ext/ggml-backend-impl.h b/src/whisper/bindings/ruby/ext/ggml-backend-impl.h index 211e3d4..31788cd 100644 --- a/src/whisper/bindings/ruby/ext/ggml-backend-impl.h +++ b/src/whisper/bindings/ruby/ext/ggml-backend-impl.h @@ -70,7 +70,7 @@ extern "C" { void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); // compute graph without a plan - void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph); + bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph); // check if the backend supports an operation bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); diff --git a/src/whisper/bindings/ruby/ext/ggml-backend.c b/src/whisper/bindings/ruby/ext/ggml-backend.c index f6e5fce..128e33c 100644 --- a/src/whisper/bindings/ruby/ext/ggml-backend.c +++ b/src/whisper/bindings/ruby/ext/ggml-backend.c @@ -156,8 +156,8 @@ void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_ backend->iface.graph_plan_compute(backend, plan); } -void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - backend->iface.graph_compute(backend, cgraph); +bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + return backend->iface.graph_compute(backend, cgraph); } bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { diff --git a/src/whisper/bindings/ruby/ext/ggml-backend.h b/src/whisper/bindings/ruby/ext/ggml-backend.h index 9666873..793a0a9 100644 --- a/src/whisper/bindings/ruby/ext/ggml-backend.h +++ b/src/whisper/bindings/ruby/ext/ggml-backend.h @@ -52,7 +52,7 @@ extern "C" { GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan); GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan); - GGML_API void ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); + GGML_API bool ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op); // tensor copy between different backends diff --git a/src/whisper/extra/sync-ggml-am.sh b/src/whisper/extra/sync-ggml-am.sh index 0b8e55a..25f5372 100755 --- a/src/whisper/extra/sync-ggml-am.sh +++ b/src/whisper/extra/sync-ggml-am.sh @@ -27,7 +27,7 @@ echo "Syncing ggml changes since commit $lc" cd $SRC_GGML git log --oneline $lc..HEAD -git log --oneline $lc..HEAD | grep -v "(whisper/[0-9]*)" | cut -d' ' -f1 > $SRC_WHISPER/ggml-commits +git log --oneline $lc..HEAD --reverse | grep -v "(whisper/[0-9]*)" | cut -d' ' -f1 > $SRC_WHISPER/ggml-commits if [ ! -s $SRC_WHISPER/ggml-commits ]; then rm -v $SRC_WHISPER/ggml-commits @@ -48,11 +48,14 @@ while read c; do src/ggml*.m \ src/ggml*.metal \ src/ggml*.cu \ - tests/test-opt.cpp \ - tests/test-grad0.cpp \ - tests/test-quantize-fns.cpp \ - tests/test-quantize-perf.cpp \ - tests/test-backend-ops.cpp \ + examples/common.h \ + examples/common.cpp \ + examples/common-ggml.h \ + examples/common-ggml.cpp \ + examples/whisper/whisper.h \ + examples/whisper/whisper.cpp \ + examples/whisper/main.cpp \ + examples/whisper/quantize.cpp \ >> $SRC_WHISPER/ggml-src.patch done < $SRC_WHISPER/ggml-commits @@ -87,7 +90,6 @@ if [ -f $SRC_WHISPER/ggml-src.patch ]; then # src/ggml-impl.h -> ggml-impl.h # src/ggml-metal.h -> ggml-metal.h # src/ggml-metal.m -> ggml-metal.m - # src/ggml-metal.metal -> ggml-metal.metal # src/ggml-mpi.h -> ggml-mpi.h # src/ggml-mpi.c -> ggml-mpi.c # src/ggml-opencl.cpp -> ggml-opencl.cpp @@ -118,7 +120,6 @@ if [ -f $SRC_WHISPER/ggml-src.patch ]; then -e 's/src\/ggml-impl\.h/ggml-impl.h/g' \ -e 's/src\/ggml-metal\.h/ggml-metal.h/g' \ -e 's/src\/ggml-metal\.m/ggml-metal.m/g' \ - -e 's/src\/ggml-metal\.metal/ggml-metal.metal/g' \ -e 's/src\/ggml-mpi\.h/ggml-mpi.h/g' \ -e 's/src\/ggml-mpi\.c/ggml-mpi.c/g' \ -e 's/src\/ggml-opencl\.cpp/ggml-opencl.cpp/g' \ diff --git a/src/whisper/extra/sync-ggml.last b/src/whisper/extra/sync-ggml.last index 9166e0f..354246a 100644 --- a/src/whisper/extra/sync-ggml.last +++ b/src/whisper/extra/sync-ggml.last @@ -1 +1 @@ -965137f49917768959679a9e860dc414e170fd55 +3fd01e00e40583ccd4b393a7c6502d6a4455a1d5 diff --git a/src/whisper/ggml-backend-impl.h b/src/whisper/ggml-backend-impl.h index 0585993..ca21b47 100644 --- a/src/whisper/ggml-backend-impl.h +++ b/src/whisper/ggml-backend-impl.h @@ -90,7 +90,7 @@ extern "C" { void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); // compute graph without a plan - void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph); + bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph); // check if the backend supports an operation bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); diff --git a/src/whisper/ggml-backend.c b/src/whisper/ggml-backend.c index 2c37520..53e741c 100644 --- a/src/whisper/ggml-backend.c +++ b/src/whisper/ggml-backend.c @@ -195,11 +195,14 @@ void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_ ggml_backend_synchronize(backend); } -void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - backend->iface.graph_compute(backend, cgraph); +bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + if (!backend->iface.graph_compute(backend, cgraph)) { + return false; + } // TODO: optional sync ggml_backend_synchronize(backend); + return true; } bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { @@ -597,7 +600,7 @@ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_bac GGML_UNUSED(backend); } -static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); @@ -611,6 +614,7 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c cplan.work_data = cpu_ctx->work_data; ggml_graph_compute(cgraph, &cplan); + return true; } static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { diff --git a/src/whisper/ggml-backend.h b/src/whisper/ggml-backend.h index a9d2fdd..85ff67b 100644 --- a/src/whisper/ggml-backend.h +++ b/src/whisper/ggml-backend.h @@ -58,7 +58,7 @@ extern "C" { GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan); GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan); - GGML_API void ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); + GGML_API bool ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op); // tensor copy between different backends diff --git a/src/whisper/ggml-cuda.cu b/src/whisper/ggml-cuda.cu index 9a9effc..10c2161 100644 --- a/src/whisper/ggml-cuda.cu +++ b/src/whisper/ggml-cuda.cu @@ -119,7 +119,9 @@ #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products #define CC_VOLTA 700 #define CC_OFFSET_AMD 1000000 +#define CC_RDNA1 (CC_OFFSET_AMD + 1010) #define CC_RDNA2 (CC_OFFSET_AMD + 1030) +#define CC_RDNA3 (CC_OFFSET_AMD + 1100) #define GGML_CUDA_MAX_NODES 8192 @@ -133,7 +135,6 @@ // TODO: improve this to be correct for more hardware // for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores -// probably other such cases, and not sure what happens on AMD hardware #if !defined(GGML_CUDA_FORCE_MMQ) #define CUDA_USE_TENSOR_CORES #endif @@ -6662,7 +6663,7 @@ static void ggml_cuda_pool_free_leg(int device, void * ptr, size_t size) { // pool with virtual memory static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0}; static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0}; -static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB +static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB static void * ggml_cuda_pool_malloc_vmm(int device, size_t size, size_t * actual_size) { scoped_spin_lock lock(g_cuda_pool_lock); @@ -8661,11 +8662,25 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 } } +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + + const bool fp16_performance_good = min_compute_capability >= CC_RDNA1; + bool use_mul_mat_q = ggml_is_quantized(src0->type); #ifdef CUDA_USE_TENSOR_CORES - const bool use_tensor_cores = true; + use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3; +#endif // CUDA_USE_TENSOR_CORES + #else - const bool use_tensor_cores = false; -#endif + + const bool fp16_performance_good = min_compute_capability >= CC_VOLTA; + bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type); +#ifdef CUDA_USE_TENSOR_CORES + // when tensor cores are available, use them for large batch size + // ref: https://github.com/ggerganov/llama.cpp/pull/3776 + use_mul_mat_q = use_mul_mat_q && !(fp16_performance_good && src1->ne[1] > MMQ_MAX_BATCH_SIZE); +#endif // CUDA_USE_TENSOR_CORES + +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) // debug helpers //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); @@ -8675,13 +8690,13 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); - if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { + if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { // KQ single-batch ggml_cuda_mul_mat_vec_p021(src0, src1, dst); - } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { + } else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { // KQV single-batch ggml_cuda_mul_mat_vec_nc(src0, src1, dst); - } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) { + } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) { // KQ + KQV multi-batch ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst); } else if (src0->type == GGML_TYPE_F32) { @@ -8701,14 +8716,6 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false); } } else { - bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type); - - // when tensor cores are available, use them for large batch size - // ref: https://github.com/ggerganov/llama.cpp/pull/3776 - if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) { - use_mul_mat_q = false; - } - if (use_mul_mat_q) { ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true); } else { @@ -9903,7 +9910,7 @@ static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_ba UNUSED(plan); } -static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; ggml_cuda_set_main_device(cuda_ctx->device); @@ -9960,6 +9967,8 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph } UNUSED(backend); + + return true; } static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) { @@ -10032,14 +10041,19 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten } return false; } break; + case GGML_OP_DUP: + case GGML_OP_REPEAT: + case GGML_OP_CONCAT: + { + ggml_type src0_type = op->src[0]->type; + return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16; + } break; case GGML_OP_NONE: case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: case GGML_OP_NORM: - case GGML_OP_REPEAT: - case GGML_OP_DUP: case GGML_OP_ADD: case GGML_OP_MUL: case GGML_OP_DIV: @@ -10056,7 +10070,6 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten case GGML_OP_SUM_ROWS: case GGML_OP_ARGSORT: case GGML_OP_ACC: - case GGML_OP_CONCAT: case GGML_OP_GROUP_NORM: case GGML_OP_UPSCALE: case GGML_OP_PAD: diff --git a/src/whisper/ggml-metal.h b/src/whisper/ggml-metal.h index b5e02b6..c4b7325 100644 --- a/src/whisper/ggml-metal.h +++ b/src/whisper/ggml-metal.h @@ -87,7 +87,7 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx); // same as ggml_graph_compute but uses Metal // creates gf->n_threads command buffers in parallel -void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf); +bool ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf); // // backend API diff --git a/src/whisper/ggml-metal.m b/src/whisper/ggml-metal.m index 51a72ae..55cc1a8 100644 --- a/src/whisper/ggml-metal.m +++ b/src/whisper/ggml-metal.m @@ -87,6 +87,7 @@ GGML_METAL_DECL_KERNEL(get_rows_q4_K); GGML_METAL_DECL_KERNEL(get_rows_q5_K); GGML_METAL_DECL_KERNEL(get_rows_q6_K); + GGML_METAL_DECL_KERNEL(get_rows_i32); GGML_METAL_DECL_KERNEL(rms_norm); GGML_METAL_DECL_KERNEL(group_norm); GGML_METAL_DECL_KERNEL(norm); @@ -257,13 +258,14 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; #endif NSError * error = nil; - NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"]; + NSString * libPath = [bundle pathForResource:@"ggml" ofType:@"metallib"]; if (libPath != nil) { + // pre-compiled library found NSURL * libURL = [NSURL fileURLWithPath:libPath]; GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]); ctx->library = [ctx->device newLibraryWithURL:libURL error:&error]; } else { - GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__); + GGML_METAL_LOG_INFO("%s: ggml.metallib not found, loading from source\n", __func__); NSString * sourcePath; NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"]; @@ -291,6 +293,13 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ options = [MTLCompileOptions new]; options.preprocessorMacros = @{ @"QK_K" : @(64) }; #endif + // try to disable fast-math + // NOTE: this seems to have no effect whatsoever + // instead, in order to disable fast-math, we have to build ggml.metallib from the command line + // using xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air + // and go through the "pre-compiled library found" path above + //[options setFastMathEnabled:false]; + ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error]; } @@ -369,6 +378,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(get_rows_q4_K); GGML_METAL_ADD_KERNEL(get_rows_q5_K); GGML_METAL_ADD_KERNEL(get_rows_q6_K); + GGML_METAL_ADD_KERNEL(get_rows_i32); GGML_METAL_ADD_KERNEL(rms_norm); GGML_METAL_ADD_KERNEL(group_norm); GGML_METAL_ADD_KERNEL(norm); @@ -491,6 +501,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(get_rows_q4_K); GGML_METAL_DEL_KERNEL(get_rows_q5_K); GGML_METAL_DEL_KERNEL(get_rows_q6_K); + GGML_METAL_DEL_KERNEL(get_rows_i32); GGML_METAL_DEL_KERNEL(rms_norm); GGML_METAL_DEL_KERNEL(group_norm); GGML_METAL_DEL_KERNEL(norm); @@ -966,7 +977,7 @@ static bool ggml_metal_supports_op(const struct ggml_tensor * op) { return false; } } -void ggml_metal_graph_compute( +bool ggml_metal_graph_compute( struct ggml_metal_context * ctx, struct ggml_cgraph * gf) { @autoreleasepool { @@ -1230,7 +1241,7 @@ void ggml_metal_graph_compute( // not sure how to avoid this // TODO: make a simpler cpy_bytes kernel - const int nth = MIN(1024, ne00); + const int nth = MIN((int) ctx->pipeline_cpy_f32_f32.maxTotalThreadsPerThreadgroup, ne00); [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -1285,7 +1296,7 @@ void ggml_metal_graph_compute( [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26]; [encoder setBytes:&offs length:sizeof(offs) atIndex:27]; - const int nth = MIN(1024, ne0); + const int nth = MIN((int) ctx->pipeline_add.maxTotalThreadsPerThreadgroup, ne00); [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; @@ -1649,6 +1660,10 @@ void ggml_metal_graph_compute( } }; + if (ggml_is_quantized(src0t)) { + GGML_ASSERT(ne00 >= nth0*nth1); + } + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; @@ -1707,6 +1722,9 @@ void ggml_metal_graph_compute( // TODO: make this more general GGML_ASSERT(n_as <= 8); + // max size of the src1ids array in the kernel stack + GGML_ASSERT(ne11 <= 512); + struct ggml_tensor * src2 = gf->nodes[i]->src[2]; const int64_t ne20 = src2 ? src2->ne[0] : 0; @@ -1724,9 +1742,6 @@ void ggml_metal_graph_compute( GGML_ASSERT(!ggml_is_transposed(src2)); GGML_ASSERT(!ggml_is_transposed(src1)); - GGML_ASSERT(ne20 % 32 == 0); - // !!!!!!!!! TODO: this assert is probably required but not sure! - //GGML_ASSERT(ne20 >= 64); GGML_ASSERT(src1t == GGML_TYPE_F32); const uint r2 = ne12/ne22; @@ -1734,22 +1749,22 @@ void ggml_metal_graph_compute( // find the break-even point where the matrix-matrix kernel becomes more efficient compared // to the matrix-vector kernel - int ne11_mm_min = 1; + int ne11_mm_min = n_as; const int idx = ((int32_t *) dst->op_params)[0]; // batch size GGML_ASSERT(ne01 == ne11); - const int64_t _ne1 = 1; // kernel_mul_mm_impl needs a reference in constant memory - // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel // !!! // TODO: for now, always use mat-vec kernels until we figure out how to improve the // indirect matrix multiplication // !!! - if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && _ne1 > ne11_mm_min) { + if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && + ne20 % 32 == 0 && ne20 >= 64 && + ne11 > ne11_mm_min) { switch (src2->type) { case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f32_f32]; break; case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f16_f32]; break; @@ -1779,14 +1794,15 @@ void ggml_metal_graph_compute( [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11]; [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12]; [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13]; - [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:14]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14]; [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; [encoder setBytes:&r2 length:sizeof(r2) atIndex:16]; [encoder setBytes:&r3 length:sizeof(r3) atIndex:17]; [encoder setBytes:&idx length:sizeof(idx) atIndex:18]; // TODO: how to make this an array? read Metal docs - for (int j = 0; j < n_as; ++j) { - struct ggml_tensor * src_cur = dst->src[2 + j]; + for (int j = 0; j < 8; ++j) { + // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8 + struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)]; size_t offs_src_cur = 0; id id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur); @@ -1796,8 +1812,7 @@ void ggml_metal_graph_compute( [encoder setThreadgroupMemoryLength:8192 atIndex:0]; - // TODO: processing one row at a time (ne11 -> 1) is not efficient - [encoder dispatchThreadgroups:MTLSizeMake( (_ne1 + 31)/32, (ne21 + 63)/64, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne21 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; } else { int nth0 = 32; int nth1 = 1; @@ -1880,11 +1895,17 @@ void ggml_metal_graph_compute( } break; default: { - GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t); + GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t); GGML_ASSERT(false && "not implemented"); } }; + if (ggml_is_quantized(src2t)) { + GGML_ASSERT(ne20 >= nth0*nth1); + } + + const int64_t _ne1 = 1; // kernels needs a reference in constant memory + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; @@ -1909,8 +1930,9 @@ void ggml_metal_graph_compute( [encoder setBytes:&r3 length:sizeof(r3) atIndex:21]; [encoder setBytes:&idx length:sizeof(idx) atIndex:22]; // TODO: how to make this an array? read Metal docs - for (int j = 0; j < n_as; ++j) { - struct ggml_tensor * src_cur = dst->src[2 + j]; + for (int j = 0; j < 8; ++j) { + // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8 + struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)]; size_t offs_src_cur = 0; id id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur); @@ -1959,6 +1981,7 @@ void ggml_metal_graph_compute( case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break; case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break; case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break; + case GGML_TYPE_I32: [encoder setComputePipelineState:ctx->pipeline_get_rows_i32]; break; default: GGML_ASSERT(false && "not implemented"); } @@ -2229,7 +2252,7 @@ void ggml_metal_graph_compute( [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; [encoder setBytes:&sf length:sizeof(sf) atIndex:18]; - const int nth = MIN(1024, ne0); + const int nth = MIN((int) ctx->pipeline_upscale_f32.maxTotalThreadsPerThreadgroup, ne0); [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; @@ -2382,10 +2405,11 @@ void ggml_metal_graph_compute( MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status]; if (status != MTLCommandBufferStatusCompleted) { GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); - GGML_ASSERT(false); + return false; } } + return true; } } @@ -2665,10 +2689,10 @@ static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggm UNUSED(backend); } -static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context; - ggml_metal_graph_compute(metal_ctx, cgraph); + return ggml_metal_graph_compute(metal_ctx, cgraph); } static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { diff --git a/src/whisper/ggml-metal.metal b/src/whisper/ggml-metal.metal index d5b54e1..a7d3f9e 100644 --- a/src/whisper/ggml-metal.metal +++ b/src/whisper/ggml-metal.metal @@ -59,26 +59,26 @@ kernel void kernel_add( constant int64_t & ne01, constant int64_t & ne02, constant int64_t & ne03, - constant int64_t & nb00, - constant int64_t & nb01, - constant int64_t & nb02, - constant int64_t & nb03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant int64_t & ne13, - constant int64_t & nb10, - constant int64_t & nb11, - constant int64_t & nb12, - constant int64_t & nb13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant uint64_t & nb13, constant int64_t & ne0, constant int64_t & ne1, constant int64_t & ne2, constant int64_t & ne3, - constant int64_t & nb0, - constant int64_t & nb1, - constant int64_t & nb2, - constant int64_t & nb3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, constant int64_t & offs, uint3 tgpig[[threadgroup_position_in_grid]], uint3 tpitg[[thread_position_in_threadgroup]], @@ -109,26 +109,26 @@ kernel void kernel_mul( constant int64_t & ne01, constant int64_t & ne02, constant int64_t & ne03, - constant int64_t & nb00, - constant int64_t & nb01, - constant int64_t & nb02, - constant int64_t & nb03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant int64_t & ne13, - constant int64_t & nb10, - constant int64_t & nb11, - constant int64_t & nb12, - constant int64_t & nb13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant uint64_t & nb13, constant int64_t & ne0, constant int64_t & ne1, constant int64_t & ne2, constant int64_t & ne3, - constant int64_t & nb0, - constant int64_t & nb1, - constant int64_t & nb2, - constant int64_t & nb3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, uint3 tgpig[[threadgroup_position_in_grid]], uint3 tpitg[[thread_position_in_threadgroup]], uint3 ntg[[threads_per_threadgroup]]) { @@ -158,26 +158,26 @@ kernel void kernel_div( constant int64_t & ne01, constant int64_t & ne02, constant int64_t & ne03, - constant int64_t & nb00, - constant int64_t & nb01, - constant int64_t & nb02, - constant int64_t & nb03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant int64_t & ne13, - constant int64_t & nb10, - constant int64_t & nb11, - constant int64_t & nb12, - constant int64_t & nb13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant uint64_t & nb13, constant int64_t & ne0, constant int64_t & ne1, constant int64_t & ne2, constant int64_t & ne3, - constant int64_t & nb0, - constant int64_t & nb1, - constant int64_t & nb2, - constant int64_t & nb3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, uint3 tgpig[[threadgroup_position_in_grid]], uint3 tpitg[[thread_position_in_threadgroup]], uint3 ntg[[threads_per_threadgroup]]) { @@ -205,7 +205,7 @@ kernel void kernel_add_row( device const float4 * src0, device const float4 * src1, device float4 * dst, - constant int64_t & nb [[buffer(28)]], + constant uint64_t & nb [[buffer(28)]], uint tpig[[thread_position_in_grid]]) { dst[tpig] = src0[tpig] + src1[tpig % nb]; } @@ -214,7 +214,7 @@ kernel void kernel_mul_row( device const float4 * src0, device const float4 * src1, device float4 * dst, - constant int64_t & nb [[buffer(28)]], + constant uint64_t & nb [[buffer(28)]], uint tpig[[thread_position_in_grid]]) { dst[tpig] = src0[tpig] * src1[tpig % nb]; } @@ -223,7 +223,7 @@ kernel void kernel_div_row( device const float4 * src0, device const float4 * src1, device float4 * dst, - constant int64_t & nb [[buffer(28)]], + constant uint64_t & nb [[buffer(28)]], uint tpig[[thread_position_in_grid]]) { dst[tpig] = src0[tpig] / src1[tpig % nb]; } @@ -307,26 +307,26 @@ kernel void kernel_sum_rows( constant int64_t & ne01, constant int64_t & ne02, constant int64_t & ne03, - constant int64_t & nb00, - constant int64_t & nb01, - constant int64_t & nb02, - constant int64_t & nb03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant int64_t & ne13, - constant int64_t & nb10, - constant int64_t & nb11, - constant int64_t & nb12, - constant int64_t & nb13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant uint64_t & nb13, constant int64_t & ne0, constant int64_t & ne1, constant int64_t & ne2, constant int64_t & ne3, - constant int64_t & nb0, - constant int64_t & nb1, - constant int64_t & nb2, - constant int64_t & nb3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, uint3 tpig[[thread_position_in_grid]]) { int64_t i3 = tpig.z; int64_t i2 = tpig.y; @@ -846,7 +846,7 @@ inline float block_q_n_dot_y(device const block_q5_1 * qb_curr, float sumy, thre #define N_SIMDGROUP 2 // number of SIMD groups in a thread group //Note: This is a template, but strictly speaking it only applies to // quantizations where the block size is 32. It also does not -// giard against the number of rows not being divisible by +// guard against the number of rows not being divisible by // N_DST, so this is another explicit assumption of the implementation. template void mul_vec_q_n_f32_impl( @@ -920,14 +920,21 @@ kernel void kernel_mul_mv_q4_0_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne01[[buffer(4)]], - constant int64_t & ne02[[buffer(5)]], - constant int64_t & ne10[[buffer(9)]], - constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0 [[buffer(15)]], - constant int64_t & ne1 [[buffer(16)]], - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -939,14 +946,21 @@ kernel void kernel_mul_mv_q4_1_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne01[[buffer(4)]], - constant int64_t & ne02[[buffer(5)]], - constant int64_t & ne10[[buffer(9)]], - constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0 [[buffer(15)]], - constant int64_t & ne1 [[buffer(16)]], - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -958,14 +972,21 @@ kernel void kernel_mul_mv_q5_0_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne01[[buffer(4)]], - constant int64_t & ne02[[buffer(5)]], - constant int64_t & ne10[[buffer(9)]], - constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0 [[buffer(15)]], - constant int64_t & ne1 [[buffer(16)]], - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -977,14 +998,21 @@ kernel void kernel_mul_mv_q5_1_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne01[[buffer(4)]], - constant int64_t & ne02[[buffer(5)]], - constant int64_t & ne10[[buffer(9)]], - constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0 [[buffer(15)]], - constant int64_t & ne1 [[buffer(16)]], - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -1071,12 +1099,19 @@ kernel void kernel_mul_mv_q8_0_f32( constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, constant int64_t & ne10, + constant int64_t & ne11, constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -1182,8 +1217,8 @@ kernel void kernel_mul_mv_f32_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]]) { kernel_mul_mv_f32_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1, r2, r3, tgpig, tiisg); @@ -1209,8 +1244,8 @@ kernel void kernel_mul_mv_f16_f16( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]]) { @@ -1346,8 +1381,8 @@ kernel void kernel_mul_mv_f16_f32_1row( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]]) { kernel_mul_mv_f16_f32_1row_impl(src0, src1, dst, ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1, r2, r3, tgpig, tiisg); @@ -1452,8 +1487,8 @@ kernel void kernel_mul_mv_f16_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]]) { kernel_mul_mv_f16_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1, r2, r3, tgpig, tiisg); @@ -1478,8 +1513,8 @@ kernel void kernel_mul_mv_f16_f32_l4( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]]) { @@ -1543,7 +1578,8 @@ kernel void kernel_alibi_f32( const int64_t i3 = n / (ne2*ne1*ne0); const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; - const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0); + //const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0); + const int64_t k = i3*ne3 + i2; float m_k; @@ -2410,22 +2446,6 @@ typedef struct { } block_q6_K; // 210 bytes / block -static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) { - uchar4 r; - if (j < 4) { - r[0] = q[j+0] & 63; - r[2] = q[j+1] & 63; - r[1] = q[j+4] & 63; - r[3] = q[j+5] & 63; - } else { - r[0] = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4); - r[2] = (q[j+5] & 0xF) | ((q[j-3] >> 6) << 4); - r[1] = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); - r[3] = (q[j+5] >> 4) | ((q[j+1] >> 6) << 4); - } - return r; -} - //====================================== dot products ========================= void kernel_mul_mv_q2_K_f32_impl( @@ -2584,14 +2604,21 @@ kernel void kernel_mul_mv_q2_K_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne01[[buffer(4)]], - constant int64_t & ne02[[buffer(5)]], - constant int64_t & ne10[[buffer(9)]], - constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0 [[buffer(15)]], - constant int64_t & ne1 [[buffer(16)]], - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -2841,14 +2868,21 @@ kernel void kernel_mul_mv_q3_K_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne01[[buffer(4)]], - constant int64_t & ne02[[buffer(5)]], - constant int64_t & ne10[[buffer(9)]], - constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0 [[buffer(15)]], - constant int64_t & ne1 [[buffer(16)]], - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -2984,8 +3018,8 @@ void kernel_mul_mv_q4_K_f32_impl( constant uint & r2, constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { const int ix = tiisg/4; // 0...7 const int it = tiisg%4; // 0...3 @@ -2994,7 +3028,7 @@ void kernel_mul_mv_q4_K_f32_impl( const int r0 = tgpig.x; const int r1 = tgpig.y; const int im = tgpig.z; - const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; + const int first_row = r0 * N_DST; const int ib_row = first_row * nb; const uint i12 = im%ne12; @@ -3060,7 +3094,7 @@ void kernel_mul_mv_q4_K_f32_impl( for (int row = 0; row < N_DST; ++row) { all_sum = simd_sum(sumf[row]); if (tiisg == 0) { - dst[r1*ne0+ im*ne0*ne1 + first_row + row] = all_sum; + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum; } } } @@ -3072,14 +3106,21 @@ kernel void kernel_mul_mv_q4_K_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne01[[buffer(4)]], - constant int64_t & ne02[[buffer(5)]], - constant int64_t & ne10[[buffer(9)]], - constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0 [[buffer(15)]], - constant int64_t & ne1 [[buffer(16)]], - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -3271,14 +3312,21 @@ kernel void kernel_mul_mv_q5_K_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne01[[buffer(4)]], - constant int64_t & ne02[[buffer(5)]], - constant int64_t & ne10[[buffer(9)]], - constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0 [[buffer(15)]], - constant int64_t & ne1 [[buffer(16)]], - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -3398,14 +3446,21 @@ kernel void kernel_mul_mv_q6_K_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne01[[buffer(4)]], - constant int64_t & ne02[[buffer(5)]], - constant int64_t & ne10[[buffer(9)]], - constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0 [[buffer(15)]], - constant int64_t & ne1 [[buffer(16)]], - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -3523,7 +3578,7 @@ void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg device const int8_t * qs = ((device const int8_t *)xb->qs); const half d = xb->d; - for (int i=0;i<16;i++) { + for (int i = 0; i < 16; i++) { reg[i/4][i%4] = (qs[i + 16*il] * d); } } @@ -3774,6 +3829,35 @@ kernel void kernel_get_rows_f16( } } +kernel void kernel_get_rows_i32( + device const void * src0, + device const char * src1, + device int32_t * dst, + constant int64_t & ne00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb1, + constant uint64_t & nb2, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint3 tptg [[threads_per_threadgroup]]) { + const int64_t i10 = tgpig.x; + const int64_t i11 = tgpig.y; + + const int64_t r = ((device int32_t *) ((device char *) src1 + i11*nb11 + i10*nb10))[0]; + + const int64_t i02 = i11; + + for (int ind = tiitg; ind < ne00; ind += tptg.x) { + ((device int32_t *) ((device char *) dst + i11*nb2 + i10*nb1))[ind] = + ((device int32_t *) ((device char *) src0 + r*nb01 + i02*nb02))[ind]; + } +} + + #define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A #define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B #define BLOCK_SIZE_K 32 @@ -3792,12 +3876,12 @@ void kernel_mul_mm_impl(device const uchar * src0, device float * dst, constant int64_t & ne00, constant int64_t & ne02, - constant int64_t & nb01, - constant int64_t & nb02, + constant uint64_t & nb01, + constant uint64_t & nb02, constant int64_t & ne12, - constant int64_t & nb10, - constant int64_t & nb11, - constant int64_t & nb12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, constant uint & r2, @@ -3918,18 +4002,143 @@ void kernel_mul_mm_impl(device const uchar * src0, } } +// same as kernel_mul_mm_impl, but src1 and dst are accessed via indices stored in src1ids +template +void kernel_mul_mm_id_impl( + device const uchar * src0, + device const uchar * src1, + thread short * src1ids, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne02, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + int64_t ne1, + constant uint & r2, + constant uint & r3, + threadgroup uchar * shared_memory, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + threadgroup half * sa = (threadgroup half *)(shared_memory); + threadgroup float * sb = (threadgroup float *)(shared_memory + 4096); + + const uint r0 = tgpig.y; + const uint r1 = tgpig.x; + const uint im = tgpig.z; + + if (r1 * BLOCK_SIZE_N >= ne1) return; + + // if this block is of 64x32 shape or smaller + short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M; + short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N; + + // a thread shouldn't load data outside of the matrix + short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1; + short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1; + + simdgroup_half8x8 ma[4]; + simdgroup_float8x8 mb[2]; + simdgroup_float8x8 c_res[8]; + for (int i = 0; i < 8; i++){ + c_res[i] = make_filled_simdgroup_matrix(0.f); + } + + short il = (tiitg % THREAD_PER_ROW); + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + uint offset0 = (i12/r2)*nb02 + (i13/r3)*(nb02*ne02); + ushort offset1 = il/nl; + + device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1; + device const float * y = (device const float *)(src1 + + nb12 * im + + nb11 * src1ids[r1 * BLOCK_SIZE_N + thread_col] + + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL))); + + for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) { + // load data and store to threadgroup memory + half4x4 temp_a; + dequantize_func(x, il, temp_a); + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (int i = 0; i < 16; i++) { + *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \ + + (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \ + + (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4]; + } + + *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y); + + il = (il + 2 < nl) ? il + 2 : il % 2; + x = (il < 2) ? x + (2+nl-1)/nl : x; + y += BLOCK_SIZE_K; + + threadgroup_barrier(mem_flags::mem_threadgroup); + + // load matrices from threadgroup memory and conduct outer products + threadgroup half * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2)); + threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2)); + + for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) { + for (int i = 0; i < 4; i++) { + simdgroup_load(ma[i],lsma + SG_MAT_SIZE * i); + } + simdgroup_barrier(mem_flags::mem_none); + for (int i = 0; i < 2; i++) { + simdgroup_load(mb[i],lsmb + SG_MAT_SIZE * i); + } + + lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE; + lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE; + + for (int i = 0; i < 8; i++){ + simdgroup_multiply_accumulate(c_res[i], mb[i/4], ma[i%4], c_res[i]); + } + } + } + + { + threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup float * temp_str = ((threadgroup float *)shared_memory) \ + + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M; + for (int i = 0; i < 8; i++) { + simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M); + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + device float * C = dst + (BLOCK_SIZE_M * r0) + im*ne1*ne0; + if (sgitg == 0) { + for (int i = 0; i < n_rows; i++) { + for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) { + *(C + i + src1ids[j + r1*BLOCK_SIZE_N] * ne0) = *(temp_str + i + j * BLOCK_SIZE_M); + } + } + } + } +} + template kernel void kernel_mul_mm(device const uchar * src0, device const uchar * src1, device float * dst, constant int64_t & ne00, constant int64_t & ne02, - constant int64_t & nb01, - constant int64_t & nb02, + constant uint64_t & nb01, + constant uint64_t & nb02, constant int64_t & ne12, - constant int64_t & nb10, - constant int64_t & nb11, - constant int64_t & nb12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, constant uint & r2, @@ -3964,20 +4173,20 @@ template( - src0[id], - src1 + bid*nb11, - (device float *) (dst + bid*nb1), + for (int64_t i1 = 0; i1 < ne1; i1++) { + if (((device int32_t *) (ids + i1*nbi1))[idx] == id) { + src1ids[_ne1++] = i1; + } + } + + kernel_mul_mm_id_impl( + src0s[id], + src1, + src1ids, + dst, ne00, ne02, nb01, @@ -4014,7 +4233,7 @@ kernel void kernel_mul_mm_id( nb11, nb12, ne0, - ne1, + _ne1, r2, r3, shared_memory, @@ -4070,12 +4289,12 @@ typedef void (mat_mm_t)( device float * dst, constant int64_t & ne00, constant int64_t & ne02, - constant int64_t & nb01, - constant int64_t & nb02, + constant uint64_t & nb01, + constant uint64_t & nb02, constant int64_t & ne12, - constant int64_t & nb10, - constant int64_t & nb11, - constant int64_t & nb12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, constant uint & r2, @@ -4103,20 +4322,20 @@ template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm( src0[id], (device const float *) (src1 + bid*nb11), - (device float *) ( dst + bid*nb1), + dst + bid*ne0, ne00, ne01, ne02, @@ -4416,8 +4635,8 @@ kernel void kernel_mul_mv_id_q4_0_f32( kernel void kernel_mul_mv_id_q4_1_f32( device const char * ids, device const char * src1, - device uchar * dst, - constant int64_t & nbi1, + device float * dst, + constant uint64_t & nbi1, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, @@ -4433,7 +4652,7 @@ kernel void kernel_mul_mv_id_q4_1_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant int64_t & nb1, + constant uint64_t & nb1, constant uint & r2, constant uint & r3, constant int & idx, @@ -4460,7 +4679,7 @@ kernel void kernel_mul_mv_id_q4_1_f32( mul_vec_q_n_f32_impl( src0[id], (device const float *) (src1 + bid*nb11), - (device float *) ( dst + bid*nb1), + dst + bid*ne0, ne00, ne01, ne02, @@ -4479,8 +4698,8 @@ kernel void kernel_mul_mv_id_q4_1_f32( kernel void kernel_mul_mv_id_q5_0_f32( device const char * ids, device const char * src1, - device uchar * dst, - constant int64_t & nbi1, + device float * dst, + constant uint64_t & nbi1, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, @@ -4496,7 +4715,7 @@ kernel void kernel_mul_mv_id_q5_0_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant int64_t & nb1, + constant uint64_t & nb1, constant uint & r2, constant uint & r3, constant int & idx, @@ -4523,7 +4742,7 @@ kernel void kernel_mul_mv_id_q5_0_f32( mul_vec_q_n_f32_impl( src0[id], (device const float *) (src1 + bid*nb11), - (device float *) ( dst + bid*nb1), + dst + bid*ne0, ne00, ne01, ne02, @@ -4542,8 +4761,8 @@ kernel void kernel_mul_mv_id_q5_0_f32( kernel void kernel_mul_mv_id_q5_1_f32( device const char * ids, device const char * src1, - device uchar * dst, - constant int64_t & nbi1, + device float * dst, + constant uint64_t & nbi1, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, @@ -4559,7 +4778,7 @@ kernel void kernel_mul_mv_id_q5_1_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant int64_t & nb1, + constant uint64_t & nb1, constant uint & r2, constant uint & r3, constant int & idx, @@ -4586,7 +4805,7 @@ kernel void kernel_mul_mv_id_q5_1_f32( mul_vec_q_n_f32_impl( src0[id], (device const float *) (src1 + bid*nb11), - (device float *) ( dst + bid*nb1), + dst + bid*ne0, ne00, ne01, ne02, @@ -4605,8 +4824,8 @@ kernel void kernel_mul_mv_id_q5_1_f32( kernel void kernel_mul_mv_id_q2_K_f32( device const char * ids, device const char * src1, - device uchar * dst, - constant int64_t & nbi1, + device float * dst, + constant uint64_t & nbi1, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, @@ -4622,7 +4841,7 @@ kernel void kernel_mul_mv_id_q2_K_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant int64_t & nb1, + constant uint64_t & nb1, constant uint & r2, constant uint & r3, constant int & idx, @@ -4649,7 +4868,7 @@ kernel void kernel_mul_mv_id_q2_K_f32( kernel_mul_mv_q2_K_f32_impl( src0[id], (device const float *) (src1 + bid*nb11), - (device float *) ( dst + bid*nb1), + dst + bid*ne0, ne00, ne01, ne02, @@ -4668,8 +4887,8 @@ kernel void kernel_mul_mv_id_q2_K_f32( kernel void kernel_mul_mv_id_q3_K_f32( device const char * ids, device const char * src1, - device uchar * dst, - constant int64_t & nbi1, + device float * dst, + constant uint64_t & nbi1, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, @@ -4685,7 +4904,7 @@ kernel void kernel_mul_mv_id_q3_K_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant int64_t & nb1, + constant uint64_t & nb1, constant uint & r2, constant uint & r3, constant int & idx, @@ -4712,7 +4931,7 @@ kernel void kernel_mul_mv_id_q3_K_f32( kernel_mul_mv_q3_K_f32_impl( src0[id], (device const float *) (src1 + bid*nb11), - (device float *) ( dst + bid*nb1), + dst + bid*ne0, ne00, ne01, ne02, @@ -4731,8 +4950,8 @@ kernel void kernel_mul_mv_id_q3_K_f32( kernel void kernel_mul_mv_id_q4_K_f32( device const char * ids, device const char * src1, - device uchar * dst, - constant int64_t & nbi1, + device float * dst, + constant uint64_t & nbi1, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, @@ -4748,7 +4967,7 @@ kernel void kernel_mul_mv_id_q4_K_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant int64_t & nb1, + constant uint64_t & nb1, constant uint & r2, constant uint & r3, constant int & idx, @@ -4775,7 +4994,7 @@ kernel void kernel_mul_mv_id_q4_K_f32( kernel_mul_mv_q4_K_f32_impl( src0[id], (device const float *) (src1 + bid*nb11), - (device float *) ( dst + bid*nb1), + dst + bid*ne0, ne00, ne01, ne02, @@ -4794,8 +5013,8 @@ kernel void kernel_mul_mv_id_q4_K_f32( kernel void kernel_mul_mv_id_q5_K_f32( device const char * ids, device const char * src1, - device uchar * dst, - constant int64_t & nbi1, + device float * dst, + constant uint64_t & nbi1, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, @@ -4811,7 +5030,7 @@ kernel void kernel_mul_mv_id_q5_K_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant int64_t & nb1, + constant uint64_t & nb1, constant uint & r2, constant uint & r3, constant int & idx, @@ -4838,7 +5057,7 @@ kernel void kernel_mul_mv_id_q5_K_f32( kernel_mul_mv_q5_K_f32_impl( src0[id], (device const float *) (src1 + bid*nb11), - (device float *) ( dst + bid*nb1), + dst + bid*ne0, ne00, ne01, ne02, @@ -4857,8 +5076,8 @@ kernel void kernel_mul_mv_id_q5_K_f32( kernel void kernel_mul_mv_id_q6_K_f32( device const char * ids, device const char * src1, - device uchar * dst, - constant int64_t & nbi1, + device float * dst, + constant uint64_t & nbi1, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, @@ -4874,7 +5093,7 @@ kernel void kernel_mul_mv_id_q6_K_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant int64_t & nb1, + constant uint64_t & nb1, constant uint & r2, constant uint & r3, constant int & idx, @@ -4901,7 +5120,7 @@ kernel void kernel_mul_mv_id_q6_K_f32( kernel_mul_mv_q6_K_f32_impl( src0[id], (device const float *) (src1 + bid*nb11), - (device float *) ( dst + bid*nb1), + dst + bid*ne0, ne00, ne01, ne02, diff --git a/src/whisper/ggml-quants.c b/src/whisper/ggml-quants.c index 05ef8f9..55a9496 100644 --- a/src/whisper/ggml-quants.c +++ b/src/whisper/ggml-quants.c @@ -410,13 +410,17 @@ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) { #if !defined(__ARM_FEATURE_DOTPROD) -inline static int32x4_t vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) { +inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) { const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b)); const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b)); return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1))); } +#else + +#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c) + #endif #endif @@ -2481,8 +2485,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); // dot product into int32x4_t - const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h); - const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h); + const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h); + const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h); sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); @@ -2769,8 +2773,8 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); // dot product into int32x4_t - const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h); - const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h); + const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h); + const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h); sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d); @@ -2936,11 +2940,11 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), - vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), + ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), - vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), + ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); } *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); @@ -3228,11 +3232,11 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), - vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d); + ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), + ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), - vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d); + ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), + ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d); } *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1; @@ -3483,12 +3487,12 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri const int8x16_t y1_1 = vld1q_s8(y1->qs + 16); sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), - vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), + ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), - vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), + ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); } *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); @@ -3598,8 +3602,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri // We use this macro instead of a function call because for some reason // the code runs 2-3% slower, even if the function is declared inline #define MULTIPLY_ACCUM_WITH_SCALE(index)\ - isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\ - isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)]; + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\ + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)]; #define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\ q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\ @@ -3973,10 +3977,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri q2bytes.val[2] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 4), m3)); q2bytes.val[3] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 6), m3)); - isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0]; - isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1]; - isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2]; - isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3]; + isum1 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0]; + isum2 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1]; + isum1 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2]; + isum2 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3]; sum += d * (isum1 + isum2); } @@ -4256,10 +4260,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2])); q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3])); - isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0]; - isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1]; - isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2]; - isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3]; + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0]; + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1]; + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2]; + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3]; scale += 4; @@ -4273,10 +4277,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2])); q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3])); - isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0]; - isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1]; - isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2]; - isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3]; + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0]; + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1]; + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2]; + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3]; scale += 4; @@ -4757,10 +4761,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri q3bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 4), m3b), q3h.val[2])); q3bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q3bits, 6), q3h.val[3])); - isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0]; - isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2]; - isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1]; - isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3]; + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0]; + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2]; + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1]; + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3]; sum += d * isum; @@ -5109,14 +5113,14 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b)); q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b)); - const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); + const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); sumi1 += vaddvq_s32(p1) * scales[2*j+0]; q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32; q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4)); q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4)); - const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); + const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); sumi2 += vaddvq_s32(p2) * scales[2*j+1]; } @@ -5449,13 +5453,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b)); q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b)); - const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); + const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); const int32_t sumi1 = vaddvq_s32(p1) * scales[0]; q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4)); q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4)); - const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]); + const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]); const int32_t sumi2 = vaddvq_s32(p2) * scales[1]; sumf += d * (sumi1 + sumi2); @@ -5722,8 +5726,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri q5bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[0], 4), q5h.val[2])); q5bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[1], 4), q5h.val[3])); - sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++; - sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++; + sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++; + sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++; } sumf += d * sumi - dmin * sumi_mins; @@ -6112,10 +6116,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri q5bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[0], 4)), vreinterpretq_s8_u8(q5h.val[2])); q5bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[1], 4)), vreinterpretq_s8_u8(q5h.val[3])); - int32_t sumi1 = sc[0] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0])); - int32_t sumi2 = sc[1] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1])); - int32_t sumi3 = sc[2] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2])); - int32_t sumi4 = sc[3] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3])); + int32_t sumi1 = sc[0] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0])); + int32_t sumi2 = sc[1] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1])); + int32_t sumi3 = sc[2] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2])); + int32_t sumi4 = sc[3] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3])); sumf += d * (sumi1 + sumi2 + sumi3 + sumi4); } @@ -6399,10 +6403,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2])); q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3])); - isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + - vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] + - vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + - vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + + vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] + + vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + + vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; scale += 4; @@ -6426,10 +6430,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2])); q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3])); - isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + - vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] + - vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + - vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + + vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] + + vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + + vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; scale += 4; } //sum += isum * d_all * y[i].d; @@ -6816,10 +6820,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[2])), m32s); q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[3])), m32s); - isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + - vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] + - vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + - vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + + vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] + + vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + + vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; sum += isum * d_all * y[i].d; diff --git a/src/whisper/ggml.c b/src/whisper/ggml.c index a9e1ea9..b124f14 100644 --- a/src/whisper/ggml.c +++ b/src/whisper/ggml.c @@ -4766,8 +4766,11 @@ struct ggml_tensor * ggml_get_rows( } // TODO: implement non F32 return - //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]); - struct ggml_tensor * result = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0], b->ne[1], b->ne[2]); + enum ggml_type type = GGML_TYPE_F32; + if (a->type == GGML_TYPE_I32) { + type = a->type; + } + struct ggml_tensor * result = ggml_new_tensor_4d(ctx, type, a->ne[0], b->ne[0], b->ne[1], b->ne[2]); result->op = GGML_OP_GET_ROWS; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6938,14 +6941,165 @@ static void ggml_compute_forward_dup_f32( } } -static void ggml_compute_forward_dup( +// A simplified version of ggml_compute_forward_dup that doesn't do float upcasting, and just plain old memcpy. +static void ggml_compute_forward_dup_bytes( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { - if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); + GGML_ASSERT(src0->type == dst->type); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) { ggml_compute_forward_dup_same_cont(params, src0, dst); return; } + + GGML_TENSOR_UNARY_OP_LOCALS; + + const size_t type_size = ggml_type_size(src0->type); + const int ith = params->ith; // thread index + const int nth = params->nth; // number of threads + + + // parallelize by rows + const int nr = ne01; + // number of rows per thread + const int dr = (nr + nth - 1) / nth; + // row range for this thread + const int ir0 = dr * ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (src0->type == dst->type && + ne00 == ne0 && + nb00 == type_size && nb0 == type_size) { + // copy by rows + const size_t rs = ne00 * type_size; + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = ir0; i01 < ir1; i01++) { + memcpy( + ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), + ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), + rs); + } + } + } + return; + } + + if (ggml_is_contiguous(dst)) { + size_t id = 0; + char * dst_ptr = (char *) dst->data; + const size_t rs = ne00 * type_size; + + if (nb00 == type_size) { + // src0 is contigous on first dimension, copy by rows + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int64_t i01 = ir0; i01 < ir1; i01++) { + const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + memcpy(dst_ptr + id, src0_ptr, rs); + id += rs; + } + id += rs * (ne01 - ir1); + } + } + } else { + //printf("%s: this is not optimal - fix me\n", __func__); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = (char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03; + memcpy(dst_ptr + id, src0_ptr, type_size); + + id += type_size; + } + } + id += rs * (ne01 - ir1); + } + } + } + + return; + } + + // dst counters + + int64_t i10 = 0; + int64_t i11 = 0; + int64_t i12 = 0; + int64_t i13 = 0; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + memcpy(dst_ptr, src0_ptr, type_size); + + if (++i10 == ne0) { + i10 = 0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } +} + +static void ggml_compute_forward_dup( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + if (src0->type == dst->type) { + ggml_compute_forward_dup_bytes(params, src0, dst); + return; + } + switch (src0->type) { case GGML_TYPE_F16: { @@ -8404,10 +8558,12 @@ static void ggml_compute_forward_repeat( struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F16: + case GGML_TYPE_I16: { ggml_compute_forward_repeat_f16(params, src0, dst); } break; case GGML_TYPE_F32: + case GGML_TYPE_I32: { ggml_compute_forward_repeat_f32(params, src0, dst); } break; @@ -8550,6 +8706,7 @@ static void ggml_compute_forward_concat( struct ggml_tensor* dst) { switch (src0->type) { case GGML_TYPE_F32: + case GGML_TYPE_I32: { ggml_compute_forward_concat_f32(params, src0, src1, dst); } break; @@ -10674,6 +10831,7 @@ static void ggml_compute_forward_get_rows( ggml_compute_forward_get_rows_f16(params, src0, src1, dst); } break; case GGML_TYPE_F32: + case GGML_TYPE_I32: { ggml_compute_forward_get_rows_f32(params, src0, src1, dst); } break; @@ -19638,6 +19796,14 @@ int ggml_cpu_has_avx(void) { #endif } +int ggml_cpu_has_avx_vnni(void) { +#if defined(__AVXVNNI__) + return 1; +#else + return 0; +#endif +} + int ggml_cpu_has_avx2(void) { #if defined(__AVX2__) return 1; diff --git a/src/whisper/ggml.h b/src/whisper/ggml.h index 67d6bc4..64f4e45 100644 --- a/src/whisper/ggml.h +++ b/src/whisper/ggml.h @@ -2198,6 +2198,7 @@ extern "C" { // GGML_API int ggml_cpu_has_avx (void); + GGML_API int ggml_cpu_has_avx_vnni (void); GGML_API int ggml_cpu_has_avx2 (void); GGML_API int ggml_cpu_has_avx512 (void); GGML_API int ggml_cpu_has_avx512_vbmi(void); diff --git a/src/whisper/whisper.cpp b/src/whisper/whisper.cpp index 5c64410..4f216a9 100644 --- a/src/whisper/whisper.cpp +++ b/src/whisper/whisper.cpp @@ -152,7 +152,7 @@ static void whisper_log_callback_default(ggml_log_level level, const char * text // ggml helpers // -static void ggml_graph_compute_helper( +static bool ggml_graph_compute_helper( struct ggml_cgraph * graph, std::vector & buf, int n_threads, @@ -168,10 +168,10 @@ static void ggml_graph_compute_helper( plan.work_data = buf.data(); } - ggml_graph_compute(graph, &plan); + return ggml_graph_compute(graph, &plan); } -static void ggml_graph_compute_helper( +static bool ggml_graph_compute_helper( struct ggml_backend * backend, struct ggml_cgraph * graph, int n_threads) { @@ -183,7 +183,7 @@ static void ggml_graph_compute_helper( ggml_backend_metal_set_n_cb(backend, n_threads); } #endif - ggml_backend_graph_compute(backend, graph); + return ggml_backend_graph_compute(backend, graph); } // faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad" @@ -2103,7 +2103,9 @@ static bool whisper_encode_internal( ggml_allocr_alloc_graph(alloc, gf); if (!whisper_encode_external(wstate)) { - ggml_graph_compute_helper(wstate.backend, gf, n_threads); + if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) { + return false; + } } } @@ -2117,7 +2119,9 @@ static bool whisper_encode_internal( ggml_allocr_alloc_graph(alloc, gf); - ggml_graph_compute_helper(wstate.backend, gf, n_threads); + if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) { + return false; + } } // cross @@ -2130,7 +2134,9 @@ static bool whisper_encode_internal( ggml_allocr_alloc_graph(alloc, gf); - ggml_graph_compute_helper(wstate.backend, gf, n_threads); + if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) { + return false; + } } wstate.t_encode_us += ggml_time_us() - t_start_us; @@ -2552,7 +2558,9 @@ static bool whisper_decode_internal( logits = gf->nodes[gf->n_nodes - 1]; - ggml_graph_compute_helper(wstate.backend, gf, n_threads); + if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) { + return false; + } } logits_out.resize(n_tokens*n_vocab);