From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Oliver Simons <osimons@nvidia.com>
Date: Tue, 22 Jul 2025 11:02:28 +0200
Subject: [PATCH] Enable CUDA Graphs for gemma3n.

Similar to
https://github.com/ggml-org/llama.cpp/pull/14741,
though ollama has a slightly different model graph
than llama.cpp which requires different workaround
checks.
---
 ggml/src/ggml-cuda/ggml-cuda.cu | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 2b9fabf4..e7e8798b 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2474,6 +2474,10 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
     cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();

+    const std::string gemma3n_hidden_state_ops_src1_name = " (permuted) (cont)";
+    const std::string gemma3n_per_layer_proj_src1_name = " (reshaped)";
+    const std::string gemma3n_node_name = "node_";
+
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];

@@ -2496,12 +2500,22 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
         }

         if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
-            // disable CUDA graphs for batch size > 1 for now.
-            // Changes in batch size or context size can cause changes to the grid size of some kernels.
-            use_cuda_graph = false;
+            // workarounds to exclude Gemma3n's `project_per_layer_input` operation and its hidden state operations from the batch-size heuristic, specific to ollama
+            // number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here
+            if (!((node->ne[0] == 4 && node->ne[1] == 2048 && node->ne[2] == 1 && node->ne[3] == 1 && node->src[0] ?
+                       std::string(node->src[0]->name).find(gemma3n_node_name) != std::string::npos :
+                       false && node->src[1] ? node->src[1]->name == gemma3n_hidden_state_ops_src1_name :
+                       false) ||
+                  (node->ne[0] == 256 && node->ne[2] == 1 && node->ne[3] == 1 && node->src[0] ?
+                       std::string(node->src[0]->name).find(gemma3n_node_name) != std::string::npos :
+                       false && node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name :
+                       false))) {
+                // Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
+                use_cuda_graph = false;
 #ifndef NDEBUG
-            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
 #endif
+            }
         }

         if (node->op == GGML_OP_CPY) {
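
For readability, the exclusion encoded by the new condition in the second hunk can be restated as a standalone predicate. The sketch below is illustrative only and is not part of the patch; the names tensor_info and is_gemma3n_batched_add are hypothetical, and the shape and name checks are taken from the patch itself: hidden-state adds of shape {4, 2048, 1, 1} with src[1] named " (permuted) (cont)", and per_layer_proj adds with ne[0] == 256 whose src[1] is named " (reshaped)" and whose ne[1] varies between gemma3n:2b and gemma3n:4b.

// Illustrative sketch, not part of the patch: the gemma3n exclusion
// expressed as a standalone predicate. tensor_info and
// is_gemma3n_batched_add are hypothetical names.
#include <cstdint>
#include <string>

struct tensor_info {
    int64_t     ne[4];      // tensor dimensions, as in ggml_tensor::ne
    std::string src0_name;  // name of node->src[0] ("" if absent)
    std::string src1_name;  // name of node->src[1] ("" if absent)
};

static bool is_gemma3n_batched_add(const tensor_info & t) {
    // both patterns require src[0] to be a graph-internal "node_*" tensor
    const bool src0_is_node = t.src0_name.find("node_") != std::string::npos;

    // hidden-state ADD: shape {4, 2048, 1, 1}, src[1] named " (permuted) (cont)"
    const bool hidden_state =
        t.ne[0] == 4 && t.ne[1] == 2048 && t.ne[2] == 1 && t.ne[3] == 1 &&
        src0_is_node && t.src1_name == " (permuted) (cont)";

    // per_layer_proj ADD: ne[0] == 256, ne[2] == ne[3] == 1; ne[1] is not
    // checked because it differs between gemma3n:2b and gemma3n:4b
    const bool per_layer_proj =
        t.ne[0] == 256 && t.ne[2] == 1 && t.ne[3] == 1 &&
        src0_is_node && t.src1_name == " (reshaped)";

    return hidden_state || per_layer_proj;
}

When a node matches one of these two patterns, the patch leaves CUDA graph capture enabled for it; every other GGML_OP_ADD node with src[1]->ne[1] > 1 still falls through to the existing batch-size heuristic and disables CUDA graphs.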