
Commit ce2071b

anavp-nvidia authored and pwilkin committed
cuda : Enable CUDA Graph usage for Nemotron Nano v2 (NemotronH) (ggml-org#16328)
* Fix Nemotron Nano v2 9B not executing as CUDA Graph on NVIDIA GPUs
* fix to ensure test-backend-ops check passes
1 parent 79ec093 commit ce2071b

File tree

3 files changed: +20 -4 lines changed

ggml/src/ggml-cuda/cpy.cu

Lines changed: 12 additions & 2 deletions
@@ -329,7 +329,11 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
         } else
 #endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY
         {
-            CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
+            if (src0->type == GGML_TYPE_F32) {
+                ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+            } else {
+                CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
+            }
         }
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
@@ -400,7 +404,13 @@ void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
 void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
     if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
-        return nullptr;
+        // Prioritize CUDA graph compatibility over direct memory copy optimization.
+        // Using copy kernels here maintains graph indirection support, preventing performance regression from disabled CUDA graphs.
+        if (src0->type == GGML_TYPE_F32) {
+            return (void*) cpy_flt<cpy_1_flt<float, float>>;
+        } else {
+            return nullptr;
+        }
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
         return (void*) cpy_flt<cpy_1_flt<float, float>>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
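
Why the switch away from cudaMemcpyAsync matters here: a memcpy node captured into a CUDA graph bakes in its destination address, while ggml's copy kernels resolve the destination through a device-side pointer table (dest_ptrs_d) that can be refreshed between graph replays. The following is a minimal standalone sketch of that indirection pattern, not the ggml kernel; all names other than dest_ptrs_d and the node index are illustrative.

// Minimal sketch of destination-pointer indirection for CUDA-graph-friendly copies.
// Not the ggml implementation: only the idea is the same. The pointer table can be
// rewritten between graph replays without invalidating the captured graph.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void copy_f32_indirect(const float * src, float ** dest_ptrs, int node_index, long n) {
    float * dst = dest_ptrs[node_index];                  // resolved at kernel run time
    long i = blockIdx.x * (long) blockDim.x + threadIdx.x;
    if (i < n) {
        dst[i] = src[i];
    }
}

int main() {
    const long n = 1 << 20;
    float *src, *dst, **dest_ptrs_d;
    cudaMalloc(&src, n * sizeof(float));
    cudaMalloc(&dst, n * sizeof(float));
    cudaMalloc(&dest_ptrs_d, sizeof(float *));
    cudaMemcpy(dest_ptrs_d, &dst, sizeof(float *), cudaMemcpyHostToDevice); // refreshable table entry

    const int block = 256;
    const int grid  = (int) ((n + block - 1) / block);
    copy_f32_indirect<<<grid, block>>>(src, dest_ptrs_d, /*node_index=*/0, n);
    cudaDeviceSynchronize();
    printf("copy status: %s\n", cudaGetErrorString(cudaGetLastError()));
    return 0;
}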

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 5 additions & 1 deletion
@@ -2647,6 +2647,8 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
     const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased";
     const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased";
     const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased";
+    const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out";
+    const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d";
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
@@ -2675,7 +2677,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
             (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) &&
             strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 &&
             strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 &&
-            strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0) {
+            strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 &&
+            strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 &&
+            strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) {
             // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation
             // by means of matching node names. See
             // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
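
The two new prefixes extend the existing whitelist of ADD nodes that are allowed at batch size > 1 without disabling CUDA graphs; the check is a plain strncmp prefix match against node->name. A small self-contained sketch of that pattern follows. The helper name and the main() harness are illustrative, not code from ggml-cuda.cu.

// Sketch of the prefix-whitelist check used above; helper and harness are illustrative.
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>

static bool has_allowed_prefix(const char * node_name, const std::vector<std::string> & prefixes) {
    for (const auto & p : prefixes) {
        if (strncmp(node_name, p.c_str(), p.size()) == 0) {
            return true;   // e.g. "nemotron_h_block_out-17" matches the "nemotron_h_block_out" prefix
        }
    }
    return false;
}

int main() {
    const std::vector<std::string> prefixes = { "nemotron_h_block_out", "mamba2_y_add_d" };
    printf("%d\n", has_allowed_prefix("nemotron_h_block_out-17", prefixes)); // 1
    printf("%d\n", has_allowed_prefix("ffn_out-17", prefixes));              // 0
    return 0;
}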

src/llama-model.cpp

Lines changed: 3 additions & 1 deletion
@@ -11933,6 +11933,7 @@ struct llm_graph_context_mamba : public llm_graph_context {
         // TODO: skip computing output earlier for unused tokens
 
         y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
+        cb(y, "mamba2_y_add_d", il);
         y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
 
         // grouped RMS norm
@@ -14887,6 +14888,7 @@ struct llm_build_nemotron_h : public llm_graph_context_mamba {
         ggml_tensor * inpL;
 
         inpL = build_inp_embd(model.tok_embd);
+        ggml_build_forward_expand(gf, inpL);
 
         auto * inp = build_inp_mem_hybrid();
 
@@ -14918,7 +14920,7 @@ struct llm_build_nemotron_h : public llm_graph_context_mamba {
 
         // add residual
         cur = ggml_add(ctx0, cur, inpSA);
-        cb(cur, "block_out", il);
+        cb(cur, "nemotron_h_block_out", il);
 
         // input for next layer
         inpL = cur;
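
Naming the tensors in the model graph is what makes the backend whitelist above work: cb() attaches a name to the Mamba2 y + D*x result and the Nemotron H residual add, and the CUDA backend later recognizes those nodes by prefix. Below is a rough standalone sketch of that naming step, assuming the callback ultimately formats names roughly as "<name>-<layer>" via ggml_format_name; it is not the llama.cpp cb() implementation.

// Rough sketch: tagging a graph node so a backend can recognize it by name prefix.
// Assumption: the real callback ends up producing "<name>-<layer>"; illustrative only.
#include "ggml.h"
#include <cstdio>

static void name_node(ggml_tensor * t, const char * name, int il) {
    ggml_format_name(t, "%s-%d", name, il);   // e.g. "nemotron_h_block_out-3"
}

int main() {
    ggml_init_params params = { /*mem_size=*/16*1024*1024, /*mem_buffer=*/nullptr, /*no_alloc=*/false };
    ggml_context * ctx = ggml_init(params);

    ggml_tensor * cur   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    ggml_tensor * inpSA = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);

    cur = ggml_add(ctx, cur, inpSA);          // the residual add from the diff above
    name_node(cur, "nemotron_h_block_out", /*il=*/3);

    printf("%s\n", ggml_get_name(cur));       // prints: nemotron_h_block_out-3
    ggml_free(ctx);
    return 0;
}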
