@@ -2589,7 +2589,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
 
     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
     cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
-    std::uint8_t batch_size_counter = 0;
+
+    const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected";
+    const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj";
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
@@ -2612,19 +2614,16 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
 #endif
         }
 
-        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
-            // disable CUDA graphs for batch size > 1 for now. The heuristic here allows to use CUDA graphs
-            // for Gemma3n, which uses a single Matrix-Matrix Addition as part of `project_per_layer_input`, while detecting
-            // batched execution for all graphs with >1 GGML_OP_ADD nodes. See also
+        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true)) {
+            // disable CUDA graphs for batch size > 1 for now, while excluding the matrix-matrix addition that is part of Gemma3n's
+            // `project_per_layer_input` operation, matched by node names. See
+            // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
             // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773.
             // Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
-            ++batch_size_counter;
-            if (batch_size_counter > 1) {
-                use_cuda_graph = false;
-#ifndef NDEBUG
-                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to repeated batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
-#endif
-            }
+            use_cuda_graph = false;
+#ifndef NDEBUG
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+#endif
         }
 
         if (node->op == GGML_OP_CPY) {
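
To see the new guard in isolation, here is a minimal, self-contained C++ sketch of the predicate; `fake_tensor` and `add_node_blocks_cuda_graph` are hypothetical stand-ins for `ggml_tensor` and the inline condition above, not part of the ggml API:

```cpp
#include <cstdint>
#include <cstdio>
#include <string>

// Hypothetical stand-in for the ggml_tensor fields the check touches.
struct fake_tensor {
    char          name[64]; // ggml stores tensor names in a fixed-size char array
    int64_t       ne[4];    // number of elements per dimension
    fake_tensor * src[2];   // source operands, may be null
};

// Mirrors the condition added in the diff: a batched addition (src1 has
// ne[1] > 1) blocks CUDA graphs unless one of the operands carries a name
// that Gemma3n's graph-build code assigns in project_per_layer_input.
static bool add_node_blocks_cuda_graph(const fake_tensor * node) {
    const std::string src0_name = "inp_per_layer_selected";
    const std::string src1_name = "per_layer_proj";
    // char[64] decays to const char *, so operator!=(const char *, const std::string &) applies
    return node->src[1] && node->src[1]->ne[1] > 1
        && (node->src[0] ? node->src[0]->name != src0_name : true)
        && (node->src[1] ? node->src[1]->name != src1_name : true);
}

int main() {
    fake_tensor src0 = {"inp_per_layer_selected", {256, 8, 1, 1}, {nullptr, nullptr}};
    fake_tensor src1 = {"per_layer_proj",         {256, 8, 1, 1}, {nullptr, nullptr}};
    fake_tensor add  = {"some_add_node",          {256, 8, 1, 1}, {&src0, &src1}};
    // The operand names match Gemma3n's, so graphs stay enabled despite ne[1] > 1: prints "no".
    std::printf("blocks CUDA graph: %s\n", add_node_blocks_cuda_graph(&add) ? "yes" : "no");
}
```

Note that a single operand matching either name is enough to keep CUDA graphs enabled, since the disable path requires both name comparisons to fail; the second ternary also keeps the shape of the committed condition even though `node->src[1]` is already known to be non-null at that point.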