@@ -78,7 +78,13 @@
 
 // bump if necessary
 #define LLAMA_MAX_LAYERS 512
-#define LLAMA_MAX_EXPERTS 160 // DeepSeekV2
+#define LLAMA_MAX_EXPERTS 256 // DeepSeekV3
+
+enum llama_expert_gating_func_type {
+    LLAMA_EXPERT_GATING_FUNC_TYPE_NONE    = 0,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX = 1,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
+};
 
 //
 // helpers
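
The new gating enum is the core of the change: DeepSeek V3 scores its routed experts with a sigmoid instead of the softmax used by V2. As a point of reference, here is a minimal scalar sketch (plain C++, not part of the patch) of what the two gating functions compute: softmax couples the scores so they sum to 1 across all experts, while sigmoid scores each expert independently, which is why a separate re-normalization of the selected experts' weights (expert_weights_norm below) becomes relevant.

// Illustrative only: scalar versions of the two expert gating functions.
#include <algorithm>
#include <cmath>
#include <vector>

// Softmax: scores are coupled and sum to 1 across all experts.
static std::vector<float> gate_softmax(const std::vector<float> & logits) {
    const float max_l = *std::max_element(logits.begin(), logits.end());
    std::vector<float> p(logits.size());
    float sum = 0.0f;
    for (size_t i = 0; i < logits.size(); ++i) {
        p[i] = std::exp(logits[i] - max_l);
        sum += p[i];
    }
    for (float & v : p) {
        v /= sum;
    }
    return p;
}

// Sigmoid: each expert is scored independently in (0, 1); the scores do not
// sum to 1, so the top-k weights may need an explicit re-normalization.
static std::vector<float> gate_sigmoid(const std::vector<float> & logits) {
    std::vector<float> p(logits.size());
    for (size_t i = 0; i < logits.size(); ++i) {
        p[i] = 1.0f / (1.0f + std::exp(-logits[i]));
    }
    return p;
}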
@@ -282,6 +288,8 @@ enum llm_kv {
     LLM_KV_EXPERT_USED_COUNT,
     LLM_KV_EXPERT_SHARED_COUNT,
     LLM_KV_EXPERT_WEIGHTS_SCALE,
+    LLM_KV_EXPERT_WEIGHTS_NORM,
+    LLM_KV_EXPERT_GATING_FUNC,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,
@@ -398,6 +406,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
     { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
     { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
+    { LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
+    { LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
     { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
     { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
     { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
@@ -534,6 +544,7 @@ enum llm_tensor {
     LLM_TENSOR_FFN_DOWN_SHEXP,
     LLM_TENSOR_FFN_GATE_SHEXP,
     LLM_TENSOR_FFN_UP_SHEXP,
+    LLM_TENSOR_FFN_EXP_PROBS_B,
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
     LLM_TENSOR_LAYER_OUT_NORM,
@@ -1338,6 +1349,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
             { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
             { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+            { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
         },
     },
     {
@@ -2442,6 +2454,7 @@ enum e_model {
     MODEL_70B,
     MODEL_236B,
     MODEL_314B,
+    MODEL_671B,
     MODEL_SMALL,
     MODEL_MEDIUM,
     MODEL_LARGE,
@@ -2491,6 +2504,8 @@ struct llama_hparams {
     uint32_t n_ff_shexp = 0;
     uint32_t n_expert_shared = 0;
     float expert_weights_scale = 0.0;
+    bool expert_weights_norm = false;
+    uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
 
     float f_norm_eps;
     float f_norm_rms_eps;
@@ -2790,6 +2805,7 @@ struct llama_layer {
     struct ggml_tensor * ffn_down_b; // b2
     struct ggml_tensor * ffn_up_b;   // b3
     struct ggml_tensor * ffn_act;
+    struct ggml_tensor * ffn_exp_probs_b;
 
     // mamba proj
     struct ggml_tensor * ssm_in;
@@ -5376,6 +5392,14 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
     }
 }
 
+static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
+    switch (type) {
+        case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
+        case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid";
+        default: return "unknown";
+    }
+}
+
 static const char * llama_model_type_name(e_model type) {
     switch (type) {
         case MODEL_14M: return "14M";
@@ -5427,6 +5451,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_70B: return "70B";
         case MODEL_236B: return "236B";
         case MODEL_314B: return "314B";
+        case MODEL_671B: return "671B";
         case MODEL_SMALL: return "0.1B";
         case MODEL_MEDIUM: return "0.4B";
         case MODEL_LARGE: return "0.8B";
@@ -6109,6 +6134,13 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
                 ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
                 ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+                ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+                    // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
+                    // that have no expert_gating_func model parameter set
+                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
+                }
                 ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
 
                 switch (hparams.n_layer) {
@@ -6430,6 +6462,10 @@ static void llm_load_vocab(
                     tokenizer_pre == "deepseek-coder") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
                 vocab.tokenizer_clean_spaces = false;
+            } else if (
+                    tokenizer_pre == "deepseek-v3") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
+                vocab.tokenizer_clean_spaces = false;
             } else if (
                     tokenizer_pre == "falcon") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
@@ -7103,6 +7139,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: n_ff_exp             = %d\n", __func__, hparams.n_ff_exp);
         LLAMA_LOG_INFO("%s: n_expert_shared      = %d\n", __func__, hparams.n_expert_shared);
         LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+        LLAMA_LOG_INFO("%s: expert_weights_norm  = %d\n", __func__, hparams.expert_weights_norm);
+        LLAMA_LOG_INFO("%s: expert_gating_func   = %s\n", __func__, llama_expert_gating_func_name((enum llama_expert_gating_func_type) hparams.expert_gating_func));
         LLAMA_LOG_INFO("%s: rope_yarn_log_mul    = %.4f\n", __func__, hparams.rope_yarn_log_mul);
     }
 
@@ -7250,6 +7288,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
     {LLM_TENSOR_FFN_DOWN_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
     {LLM_TENSOR_FFN_GATE_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
     {LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+    {LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     // this tensor is loaded for T5, but never used
     {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
 };
@@ -8961,6 +9000,7 @@ static bool llm_load_tensors(
                         layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                     } else {
                         layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+                        layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
                         if (n_expert == 0) {
                             throw std::runtime_error("n_expert must be > 0");
@@ -9831,12 +9871,14 @@ static struct ggml_tensor * llm_build_moe_ffn(
         struct ggml_tensor * up_exps,
         struct ggml_tensor * gate_exps,
         struct ggml_tensor * down_exps,
+        struct ggml_tensor * exp_probs_b,
         int64_t n_expert,
         int64_t n_expert_used,
         llm_ffn_op_type type_op,
         bool norm_w,
         bool scale_w,
         float w_scale,
+        llama_expert_gating_func_type gating_op,
         const llm_build_cb & cb,
         int il) {
     int64_t n_embd = cur->ne[0];
@@ -9845,11 +9887,31 @@ static struct ggml_tensor * llm_build_moe_ffn(
     ggml_tensor * logits = llm_build_lora_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens]
     cb(logits, "ffn_moe_logits", il);
 
-    ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
+    ggml_tensor * probs = nullptr;
+    switch (gating_op) {
+        case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX:
+            {
+                probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
+            } break;
+        case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID:
+            {
+                probs = ggml_sigmoid(ctx, logits); // [n_expert, n_tokens]
+            } break;
+        default:
+            GGML_ABORT("fatal error");
+    }
     cb(probs, "ffn_moe_probs", il);
 
+    // add experts selection bias - introduced in DeepSeek V3
+    // leave probs unbiased as it's later used to get expert weights
+    ggml_tensor * selection_probs = probs;
+    if (exp_probs_b != nullptr) {
+        selection_probs = ggml_add(ctx, probs, exp_probs_b);
+        cb(selection_probs, "ffn_moe_probs_biased", il);
+    }
+
     // select experts
-    ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]
+    ggml_tensor * selected_experts = ggml_top_k(ctx, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
     cb(selected_experts->src[0], "ffn_moe_argsort", il);
     cb(selected_experts, "ffn_moe_topk", il);
@@ -10970,9 +11032,11 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
+                        nullptr,
                         n_expert, n_expert_used,
                         LLM_FFN_SILU, true,
                         false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                         cb, il);
             cb(cur, "ffn_moe_out", il);
         }
@@ -11461,9 +11525,11 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
+                        nullptr,
                         n_expert, n_expert_used,
                         LLM_FFN_GELU, true,
                         false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                         cb, il);
             cb(cur, "ffn_moe_out", il);
 
@@ -11602,9 +11668,11 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
+                        nullptr,
                         n_expert, n_expert_used,
                         LLM_FFN_SILU, true,
                         false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                         cb, il);
             cb(cur, "ffn_moe_out", il);
 
@@ -12732,9 +12800,11 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
+                        nullptr,
                         n_expert, n_expert_used,
                         LLM_FFN_SILU, false,
                         false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                         cb, il);
             cb(cur, "ffn_moe_out", il);
 
@@ -14726,9 +14796,11 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
+                        nullptr,
                         n_expert, n_expert_used,
                         LLM_FFN_SILU, false,
                         false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                         cb, il);
             cb(cur, "ffn_moe_out", il);
 
@@ -15123,9 +15195,11 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
+                        nullptr,
                         n_expert, n_expert_used,
                         LLM_FFN_SILU, true,
                         false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                         cb, il);
             cb(cur, "ffn_moe_out", il);
 
@@ -15338,9 +15412,11 @@ struct llm_build_context {
                             model.layers[il].ffn_up_exps,
                             model.layers[il].ffn_gate_exps,
                             model.layers[il].ffn_down_exps,
+                            model.layers[il].ffn_exp_probs_b,
                             n_expert, n_expert_used,
-                            LLM_FFN_SILU, false,
+                            LLM_FFN_SILU, hparams.expert_weights_norm,
                             true, hparams.expert_weights_scale,
+                            (enum llama_expert_gating_func_type) hparams.expert_gating_func,
                             cb, il);
                 cb(moe_out, "ffn_moe_out", il);
 
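
How these arguments resolve in practice depends on the GGUF metadata; a hedged summary (my reading of the patch, not text from it) in comment form:

// Effective routing configuration per model generation (assumed converter
// behavior; the concrete values come from each model's converted GGUF):
//
//   DeepSeek V2 / V2.5 GGUF (no new keys, no blk.%d.exp_probs_b tensors):
//     exp_probs_b -> nullptr, gating_op -> SOFTMAX (compat fallback),
//     norm_w -> false, weights scaled by expert_weights_scale as before.
//
//   DeepSeek V3 GGUF (converter writes the new keys):
//     exp_probs_b -> blk.%d.exp_probs_b, gating_op -> SIGMOID,
//     norm_w -> true, weights normalized and then scaled.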