
Commit b38f8a4

whisper : add description of ggml_mul_mat_pad
1 parent f365543 commit b38f8a4

File tree

  coreml/whisper-encoder.mm
  whisper.cpp

2 files changed: +15 -2 lines changed

coreml/whisper-encoder.mm

Lines changed: 2 additions & 2 deletions

@@ -24,8 +24,8 @@
 
     // select which device to run the Core ML model on
     MLModelConfiguration *config = [[MLModelConfiguration alloc] init];
-    //config.computeUnits = MLComputeUnitsCPUAndGPU;
-    config.computeUnits = MLComputeUnitsCPUAndNeuralEngine;
+    config.computeUnits = MLComputeUnitsCPUAndGPU;
+    //config.computeUnits = MLComputeUnitsCPUAndNeuralEngine;
     //config.computeUnits = MLComputeUnitsAll;
 
     const void * data = CFBridgingRetain([[whisper_encoder_impl alloc] initWithContentsOfURL:url_model configuration:config error:nil]);

whisper.cpp

Lines changed: 13 additions & 0 deletions

@@ -136,6 +136,19 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
     ggml_graph_compute(graph, &plan);
 }
 
+// faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
+// the idea is to represent the original matrix multiplication:
+//
+//   Z = X @ Y
+//
+// with two matrix multiplications:
+//
+//   Z = [X_0; X_1] @ [Y_0; Y_1]
+//
+// here X_0 and Y_0 are views of X and Y that have dimension 0 divisible by "pad"
+// and X_1 and Y_1 are the remaining views. X_1 and Y_1 end up being small matrices that can be processed with more
+// general-purpose kernels
+//
 static struct ggml_tensor * ggml_mul_mat_pad(struct ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * y, int pad = 32) {
 //#if !defined(GGML_USE_METAL)
 //    return ggml_mul_mat(ctx, x, y);
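
For reference, below is a minimal sketch of how the split described in the new comment can be expressed with ggml's public ops (ggml_view_3d, ggml_mul_mat, ggml_add). It illustrates the technique only and is not the code from this commit: the helper name mul_mat_pad_sketch and the simple early-exit check are assumptions, and the actual ggml_mul_mat_pad in whisper.cpp may apply extra heuristics before deciding to split.

#include "ggml.h"

// sketch (compiled as C++ because of the default argument): split x and y along
// dimension 0 into a pad-aligned view and a small remainder view, multiply the
// pairs separately and sum the two products
static struct ggml_tensor * mul_mat_pad_sketch(struct ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * y, int pad = 32) {
    // dimension 0 is already a multiple of "pad" - a single multiplication suffices
    if (x->ne[0] % pad == 0) {
        return ggml_mul_mat(ctx, x, y);
    }

    const int64_t n0 = (x->ne[0]/pad)*pad; // pad-aligned part of dimension 0

    // X_0, Y_0: views over the first n0 elements of dimension 0
    struct ggml_tensor * x_0 = ggml_view_3d(ctx, x, n0, x->ne[1], x->ne[2], x->nb[1], x->nb[2], 0);
    struct ggml_tensor * y_0 = ggml_view_3d(ctx, y, n0, y->ne[1], y->ne[2], y->nb[1], y->nb[2], 0);

    // X_1, Y_1: views over the remaining ne[0] - n0 elements (byte offset n0*nb[0])
    struct ggml_tensor * x_1 = ggml_view_3d(ctx, x, x->ne[0] - n0, x->ne[1], x->ne[2], x->nb[1], x->nb[2], n0*x->nb[0]);
    struct ggml_tensor * y_1 = ggml_view_3d(ctx, y, y->ne[0] - n0, y->ne[1], y->ne[2], y->nb[1], y->nb[2], n0*y->nb[0]);

    // Z = X_0 @ Y_0 + X_1 @ Y_1
    return ggml_add(ctx,
            ggml_mul_mat(ctx, x_0, y_0),
            ggml_mul_mat(ctx, x_1, y_1));
}

The sum at the end works because ggml_mul_mat contracts over dimension 0, so the split distributes over the product: the pad-aligned part can take the fast path (e.g. the Metal matrix kernels) while the small remainder is handled by the general-purpose kernels, as the comment describes.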
