Commit 9d28be8

Enable CUDA Graphs for gemma3n.
Similar to ggml-org/llama.cpp#14741, though ollama has a slightly different model graph than llama.cpp, which requires different workaround checks.
1 parent 191d942 commit 9d28be8
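
Background on what is being enabled: a CUDA Graph lets the backend record the stream of kernel launches for one evaluation of the model graph and then replay the whole recording with a single launch, avoiding per-kernel launch overhead during token generation. The sketch below is not ollama's code; it only illustrates the general capture/replay pattern that this commit makes applicable to gemma3n. The function run_model_graph and the CUDA 12-style cudaGraphInstantiate signature are assumptions; the real ggml-cuda implementation additionally checks graph compatibility (see the diffs below) and refreshes captured copy operations between replays.

#include <cuda_runtime.h>

// Illustration only: record one evaluation of the model graph into a CUDA
// Graph, then replay it with a single launch. Error checking omitted.
void run_model_graph(cudaStream_t stream);  // assumed: enqueues all kernels on `stream`

void eval_with_cuda_graph(cudaStream_t stream) {
    cudaGraph_t     graph    = nullptr;
    cudaGraphExec_t instance = nullptr;

    // Capture: kernels launched on `stream` are recorded instead of executed.
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
    run_model_graph(stream);
    cudaStreamEndCapture(stream, &graph);

    // Instantiate the recording (CUDA 12-style signature) and replay it.
    cudaGraphInstantiate(&instance, graph, 0);
    cudaGraphLaunch(instance, stream);
    cudaStreamSynchronize(stream);

    cudaGraphExecDestroy(instance);
    cudaGraphDestroy(graph);
}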

File tree

4 files changed: +76 -6 lines changed

llama/patches/0019-metal-add-mean-kernel-14267.patch

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ ggml-ci
  2 files changed, 67 insertions(+), 14 deletions(-)
 
 diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index ee4f2dcb..f20f5615 100644
+index a9eeebc6..110c9ece 100644
 --- a/ggml/src/ggml-metal/ggml-metal.m
 +++ b/ggml/src/ggml-metal/ggml-metal.m
 @@ -489,6 +489,7 @@ enum ggml_metal_kernel_type {

llama/patches/0020-CUDA-add-mean-operation-14313.patch

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ index 64fb4ff4..5b9a0fe3 100644
 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 4c829153..9e64e5ae 100644
+index d6960174..2b9fabf4 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
 @@ -35,6 +35,7 @@
Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Oliver Simons <osimons@nvidia.com>
+Date: Tue, 22 Jul 2025 11:02:28 +0200
+Subject: [PATCH] Enable CUDA Graphs for gemma3n.
+
+Similar to
+https://github.com/ggml-org/llama.cpp/pull/14741,
+though ollama has a slightly different model graph
+than llama.cpp which requires different workaround
+checks.
+---
+ ggml/src/ggml-cuda/ggml-cuda.cu | 22 ++++++++++++++++++----
+ 1 file changed, 18 insertions(+), 4 deletions(-)
+
+diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
+index 2b9fabf4..e7e8798b 100644
+--- a/ggml/src/ggml-cuda/ggml-cuda.cu
++++ b/ggml/src/ggml-cuda/ggml-cuda.cu
+@@ -2474,6 +2474,10 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
+     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
+     cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
+
++    const std::string gemma3n_hidden_state_ops_src1_name = " (permuted) (cont)";
++    const std::string gemma3n_per_layer_proj_src1_name = " (reshaped)";
++    const std::string gemma3n_node_name = "node_";
++
+     for (int i = 0; i < cgraph->n_nodes; i++) {
+         ggml_tensor * node = cgraph->nodes[i];
+
+@@ -2496,12 +2500,22 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
+         }
+
+         if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
+-            // disable CUDA graphs for batch size > 1 for now.
+-            // Changes in batch size or context size can cause changes to the grid size of some kernels.
+-            use_cuda_graph = false;
++            // workarounds to exclude Gemma3n's `project_per_layer_input` operation and its hidden state operations from the batch-size heuristic, specific to ollama
++            // number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here
++            if (!((node->ne[0] == 4 && node->ne[1] == 2048 && node->ne[2] == 1 && node->ne[3] == 1 && node->src[0] ?
++                       std::string(node->src[0]->name).find(gemma3n_node_name) != std::string::npos :
++                       false && node->src[1] ? node->src[1]->name == gemma3n_hidden_state_ops_src1_name :
++                                               false) ||
++                  (node->ne[0] == 256 && node->ne[2] == 1 && node->ne[3] == 1 && node->src[0] ?
++                       std::string(node->src[0]->name).find(gemma3n_node_name) != std::string::npos :
++                       false && node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name :
++                                               false))) {
++                // Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
++                use_cuda_graph = false;
+ #ifndef NDEBUG
+-            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
++                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+ #endif
++            }
+         }
+
+         if (node->op == GGML_OP_CPY) {
ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 18 additions & 4 deletions
@@ -2474,6 +2474,10 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
     cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
 
+    const std::string gemma3n_hidden_state_ops_src1_name = " (permuted) (cont)";
+    const std::string gemma3n_per_layer_proj_src1_name = " (reshaped)";
+    const std::string gemma3n_node_name = "node_";
+
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
@@ -2496,12 +2500,22 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
         }
 
         if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
-            // disable CUDA graphs for batch size > 1 for now.
-            // Changes in batch size or context size can cause changes to the grid size of some kernels.
-            use_cuda_graph = false;
+            // workarounds to exclude Gemma3n's `project_per_layer_input` operation and its hidden state operations from the batch-size heuristic, specific to ollama
+            // number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here
+            if (!((node->ne[0] == 4 && node->ne[1] == 2048 && node->ne[2] == 1 && node->ne[3] == 1 && node->src[0] ?
+                       std::string(node->src[0]->name).find(gemma3n_node_name) != std::string::npos :
+                       false && node->src[1] ? node->src[1]->name == gemma3n_hidden_state_ops_src1_name :
+                                               false) ||
+                  (node->ne[0] == 256 && node->ne[2] == 1 && node->ne[3] == 1 && node->src[0] ?
+                       std::string(node->src[0]->name).find(gemma3n_node_name) != std::string::npos :
+                       false && node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name :
+                                               false))) {
+                // Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
+                use_cuda_graph = false;
 #ifndef NDEBUG
-            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
 #endif
+            }
         }
 
         if (node->op == GGML_OP_CPY) {
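
The exemption added above packs shape checks, name checks, and ternaries into a single expression, which is hard to scan. As a readability aid only, the hypothetical helper below (is_gemma3n_exempt_add is a made-up name, not part of this commit) restates the intent suggested by the patch comments: an ADD node stays eligible for CUDA graphs when its shape matches gemma3n's hidden-state ops ({4, 2048, 1, 1}) or per_layer_proj ({256, n, 1, 1}), its src[0] name contains "node_", and its src[1] name carries the expected ggml-generated suffix. It approximates that intent rather than reproducing the original expression's exact operator precedence.

#include <string>

#include "ggml.h"  // ggml_tensor

// Hypothetical helper, illustration only (not part of this commit).
static bool is_gemma3n_exempt_add(const ggml_tensor * node) {
    if (!node->src[0] || !node->src[1]) {
        return false;
    }

    const bool src0_is_graph_node =
        std::string(node->src[0]->name).find("node_") != std::string::npos;

    // Hidden-state ops feeding project_per_layer_input: shape {4, 2048, 1, 1},
    // src[1] carrying ggml's permute + cont name suffixes.
    const bool hidden_state_add =
        node->ne[0] == 4 && node->ne[1] == 2048 && node->ne[2] == 1 && node->ne[3] == 1 &&
        std::string(node->src[1]->name) == " (permuted) (cont)";

    // per_layer_proj: shape {256, n_layers, 1, 1}; ne[1] differs between
    // gemma3n:2b and gemma3n:4b, so it is deliberately not checked.
    const bool per_layer_proj_add =
        node->ne[0] == 256 && node->ne[2] == 1 && node->ne[3] == 1 &&
        std::string(node->src[1]->name) == " (reshaped)";

    return src0_is_graph_node && (hidden_state_add || per_layer_proj_add);
}

With a helper like this, the batch-size gate could be written as if (!is_gemma3n_exempt_add(node)) { use_cuda_graph = false; }.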
