From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Oliver Simons <osimons@nvidia.com>
Date: Tue, 22 Jul 2025 11:02:28 +0200
Subject: [PATCH] Enable CUDA Graphs for gemma3n.

Similar to
https://github.com/ggml-org/llama.cpp/pull/14741,
though ollama has a slightly different model graph
than llama.cpp which requires different workaround
checks.
---
 ggml/src/ggml-cuda/ggml-cuda.cu | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 2b9fabf4..e7e8798b 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2474,6 +2474,10 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
     cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();

+    const std::string gemma3n_hidden_state_ops_src1_name = " (permuted) (cont)";
+    const std::string gemma3n_per_layer_proj_src1_name = " (reshaped)";
+    const std::string gemma3n_node_name = "node_";
+
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];

@@ -2496,12 +2500,22 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
         }

         if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
-            // disable CUDA graphs for batch size > 1 for now.
-            // Changes in batch size or context size can cause changes to the grid size of some kernels.
-            use_cuda_graph = false;
+            // workarounds to exclude Gemma3n's `project_per_layer_input` operation and its hidden state operations from the batch-size heuristic, specific to ollama
+            // number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here
+            if (!((node->ne[0] == 4 && node->ne[1] == 2048 && node->ne[2] == 1 && node->ne[3] == 1 && node->src[0] ?
+                       std::string(node->src[0]->name).find(gemma3n_node_name) != std::string::npos :
+                       false && node->src[1] ? node->src[1]->name == gemma3n_hidden_state_ops_src1_name :
+                       false) ||
+                  (node->ne[0] == 256 && node->ne[2] == 1 && node->ne[3] == 1 && node->src[0] ?
+                       std::string(node->src[0]->name).find(gemma3n_node_name) != std::string::npos :
+                       false && node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name :
+                       false))) {
+                // Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
+                use_cuda_graph = false;
 #ifndef NDEBUG
-            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
 #endif
+            }
         }

         if (node->op == GGML_OP_CPY) {
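
For readability, the exclusion encoded by the new condition in the second hunk can be restated as a standalone predicate. The sketch below is illustrative only and is not part of the patch; the names tensor_info and is_gemma3n_batched_add are hypothetical, and the shape and name checks are taken from the patch itself: hidden-state adds of shape {4, 2048, 1, 1} with src[1] named " (permuted) (cont)", and per_layer_proj adds with ne[0] == 256 whose src[1] is named " (reshaped)" and whose ne[1] varies between gemma3n:2b and gemma3n:4b.

// Illustrative sketch, not part of the patch: the gemma3n exclusion
// expressed as a standalone predicate. tensor_info and
// is_gemma3n_batched_add are hypothetical names.
#include <cstdint>
#include <string>

struct tensor_info {
    int64_t     ne[4];      // tensor dimensions, as in ggml_tensor::ne
    std::string src0_name;  // name of node->src[0] ("" if absent)
    std::string src1_name;  // name of node->src[1] ("" if absent)
};

static bool is_gemma3n_batched_add(const tensor_info & t) {
    // both patterns require src[0] to be a graph-internal "node_*" tensor
    const bool src0_is_node = t.src0_name.find("node_") != std::string::npos;

    // hidden-state ADD: shape {4, 2048, 1, 1}, src[1] named " (permuted) (cont)"
    const bool hidden_state =
        t.ne[0] == 4 && t.ne[1] == 2048 && t.ne[2] == 1 && t.ne[3] == 1 &&
        src0_is_node && t.src1_name == " (permuted) (cont)";

    // per_layer_proj ADD: ne[0] == 256, ne[2] == ne[3] == 1; ne[1] is not
    // checked because it differs between gemma3n:2b and gemma3n:4b
    const bool per_layer_proj =
        t.ne[0] == 256 && t.ne[2] == 1 && t.ne[3] == 1 &&
        src0_is_node && t.src1_name == " (reshaped)";

    return hidden_state || per_layer_proj;
}

When a node matches one of these two patterns, the patch leaves CUDA graph capture enabled for it; every other GGML_OP_ADD node with src[1]->ne[1] > 1 still falls through to the existing batch-size heuristic and disables CUDA graphs.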