
Commit 39bca8b

totally DID NOT miss those!
1 parent 5cb0ce3 commit 39bca8b

File tree

src/models/bailingmoe2.cpp
src/models/deepseek2.cpp
src/models/grovemoe.cpp
src/models/lfm2.cpp
src/models/llada.cpp

5 files changed: +62 -25 lines

src/models/bailingmoe2.cpp

Lines changed: 21 additions & 9 deletions
@@ -75,21 +75,33 @@ llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const ll
         cb(cur, "ffn_norm", il);

         if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) {
-            cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
-                            model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
             cb(cur, "ffn_out", il);
         } else {
-            ggml_tensor * moe_out = build_moe_ffn(
-                cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps,
-                model.layers[il].ffn_down_exps, model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU,
-                hparams.expert_weights_norm, true, hparams.expert_weights_scale,
-                (llama_expert_gating_func_type) hparams.expert_gating_func, il);
+            ggml_tensor * moe_out = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    model.layers[il].ffn_exp_probs_b,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, hparams.expert_weights_norm,
+                    true, hparams.expert_weights_scale,
+                    (llama_expert_gating_func_type) hparams.expert_gating_func,
+                    il);
             cb(moe_out, "ffn_moe_out", il);

             {
                 ggml_tensor * ffn_shexp =
-                    build_ffn(cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL,
-                              NULL, model.layers[il].ffn_down_shexp, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+                    build_ffn(cur,
+                            model.layers[il].ffn_up_shexp,   NULL, NULL,
+                            model.layers[il].ffn_gate_shexp, NULL, NULL,
+                            model.layers[il].ffn_down_shexp, NULL, NULL,
+                            NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
                 cb(ffn_shexp, "ffn_shexp", il);

                 cur = ggml_add(ctx0, moe_out, ffn_shexp);
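The reflow above puts one weight group per line, which makes the role of each NULL visible: every projection slot is followed by optional bias and scale slots that these models do not use. A hedged sketch of the dense call with those roles annotated (the comment names are illustrative, inferred from the call sites in this commit; the authoritative build_ffn declaration lives in src/llama-graph.h):

    // Sketch only: parameter roles inferred from the call sites above,
    // not copied from the build_ffn declaration.
    cur = build_ffn(cur,
            model.layers[il].ffn_up,   /*up_bias*/   NULL, /*up_scale*/   NULL,
            model.layers[il].ffn_gate, /*gate_bias*/ NULL, /*gate_scale*/ NULL,
            model.layers[il].ffn_down, /*down_bias*/ NULL, /*down_scale*/ NULL,
            /*act_scales*/ NULL,
            LLM_FFN_SILU,   // activation applied on the gate branch
            LLM_FFN_PAR,    // parallel (gated) FFN: down(silu(gate(x)) * up(x))
            il);            // layer index, used by the cb() debug callback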

src/models/deepseek2.cpp

Lines changed: 2 additions & 1 deletion
@@ -200,7 +200,8 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
             // FFN shared expert
             {
                 ggml_tensor * ffn_shexp =
-                    build_ffn(cur, model.layers[il].ffn_up_shexp, NULL, NULL,
+                    build_ffn(cur,
+                            model.layers[il].ffn_up_shexp, NULL, NULL,
                             model.layers[il].ffn_gate_shexp, NULL, NULL,
                             model.layers[il].ffn_down_shexp, NULL, NULL,
                             NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
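As in the bailingmoe2 hunk above, this shared-expert FFN is an always-on dense branch computed from the same normed input as the routed experts, and the two paths are then summed. A minimal sketch of that combination step, reusing the names visible in these diffs:

    // moe_out:   output of build_moe_ffn (routed experts)
    // ffn_shexp: output of the dense shared-expert build_ffn
    cur = ggml_add(ctx0, moe_out, ffn_shexp);  // routed + shared expert paths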

src/models/grovemoe.cpp

Lines changed: 24 additions & 7 deletions
@@ -82,17 +82,34 @@ llm_build_grovemoe::llm_build_grovemoe(const llama_model & model, const llm_grap
         cb(probs, "ffn_moe_logits", il);

         ggml_tensor * moe_out =
-            build_moe_ffn(cur, nullptr, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps,
-                          model.layers[il].ffn_down_exps, nullptr, n_expert, n_expert_used, LLM_FFN_SILU, true, false,
-                          0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il, probs);
+            build_moe_ffn(cur,
+                    nullptr,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, true,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    il,
+                    probs);
         cb(moe_out, "ffn_moe_out", il);
         cur = moe_out;

         // TODO: Only do the expert selection and weights once
-        moe_out = build_moe_ffn(cur, nullptr, model.layers[il].ffn_up_chexps, model.layers[il].ffn_gate_chexps,
-                                model.layers[il].ffn_down_chexps, nullptr, n_chunk_expert,
-                                n_expert_used > n_chunk_expert ? n_chunk_expert : n_expert_used, LLM_FFN_SILU, true,
-                                false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il, probs);
+        moe_out = build_moe_ffn(cur,
+                nullptr,
+                model.layers[il].ffn_up_chexps,
+                model.layers[il].ffn_gate_chexps,
+                model.layers[il].ffn_down_chexps,
+                nullptr,
+                n_chunk_expert, n_expert_used > n_chunk_expert ? n_chunk_expert : n_expert_used,
+                LLM_FFN_SILU, true,
+                false, 0.0,
+                LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                il,
+                probs);
         cb(moe_out, "ffn_adj_moe_out", il);

         cur = ggml_add(ctx0, cur, ggml_scale(ctx0, moe_out, hparams.expert_group_scale));
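Two things the reflow makes easier to read here. First, the flag block after the activation follows the same order as the bailingmoe2 call above (normalize expert weights, optionally scale them, then the scale value and gating function); grovemoe passes a softmax gate and reuses the externally computed probs for both calls. Second, the chunk-expert call caps the number of routed experts at n_chunk_expert; the ternary is a hand-written minimum. A sketch of an equivalent clamp, with the integer type as an assumption for illustration rather than taken from the source:

    #include <algorithm>  // std::min
    #include <cstdint>

    // Equivalent to: n_expert_used > n_chunk_expert ? n_chunk_expert : n_expert_used
    // The explicit template argument is an assumption to cover differing integer types.
    const int64_t n_used_chunk = std::min<int64_t>(n_expert_used, n_chunk_expert);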

src/models/lfm2.cpp

Lines changed: 10 additions & 6 deletions
@@ -53,18 +53,22 @@ llm_build_lfm2::llm_build_lfm2(const llama_model & model, const llm_graph_params
 }

 ggml_tensor * llm_build_lfm2::build_moe_feed_forward(ggml_tensor * cur, int il) const {
-    return build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
-                         model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
-                         model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false, 0.0,
-                         static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func), il);
+    return build_moe_ffn(cur,
+            model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
+            model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
+            model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false, 0.0,
+            static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func), il);
 }

 ggml_tensor * llm_build_lfm2::build_dense_feed_forward(ggml_tensor * cur, int il) const {
     GGML_ASSERT(!model.layers[il].ffn_up_b);
     GGML_ASSERT(!model.layers[il].ffn_gate_b);
     GGML_ASSERT(!model.layers[il].ffn_down_b);
-    return build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
-                     model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+    return build_ffn(cur,
+            model.layers[il].ffn_up, NULL, NULL,
+            model.layers[il].ffn_gate, NULL, NULL,
+            model.layers[il].ffn_down, NULL, NULL,
+            NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
 }

 ggml_tensor * llm_build_lfm2::build_attn_block(ggml_tensor * cur,
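The GGML_ASSERTs above document why NULL is passed for every bias slot: LFM2's dense FFN carries no bias tensors. For readers who want to see what the LLM_FFN_SILU + LLM_FFN_PAR combination roughly computes, here is a hedged expansion in plain ggml ops (a sketch of the math only, not the helper's actual implementation, which also handles biases, scales, and callbacks):

    // down( silu(gate(x)) * up(x) ), with no biases or scales
    ggml_tensor * up   = ggml_mul_mat(ctx0, model.layers[il].ffn_up,   cur);
    ggml_tensor * gate = ggml_mul_mat(ctx0, model.layers[il].ffn_gate, cur);
    gate = ggml_silu(ctx0, gate);                               // SiLU on the gate branch
    cur  = ggml_mul(ctx0, gate, up);                            // elementwise gating
    cur  = ggml_mul_mat(ctx0, model.layers[il].ffn_down, cur);  // project back to n_embd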

src/models/llada.cpp

Lines changed: 5 additions & 2 deletions
@@ -69,8 +69,11 @@ llm_build_llada::llm_build_llada(const llama_model & model, const llm_graph_para
         cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
         cb(cur, "ffn_norm", il);

-        cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
-                        model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cur = build_ffn(cur,
+                model.layers[il].ffn_up, NULL, NULL,
+                model.layers[il].ffn_gate, NULL, NULL,
+                model.layers[il].ffn_down, NULL, NULL,
+                NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
         cb(cur, "ffn_out", il);

         cur = ggml_add(ctx0, cur, ffn_inp);
