
Commit 36d63da

https://github.com/ggml-org/llama.cpp/commit/9394bbd484f802ce80d2858033583af3ef700d25
1 parent 38dae02 commit 36d63da

File tree

4 files changed, +95 -5 lines

  include/llama.h
  src/llama-vocab.cpp
  src/llama.cpp
  src/unicode.cpp

include/llama.h

Lines changed: 1 addition & 0 deletions

@@ -105,6 +105,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
         LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
         LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
     };

     enum llama_rope_type {

src/llama-vocab.cpp

Lines changed: 7 additions & 0 deletions

@@ -396,6 +396,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "\\p{N}+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
+                regex_exprs = {
+                    "\\p{N}{1,3}",
+                    "[一-龥぀-ゟ゠-ヿ]+",
+                    "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
                regex_exprs = {
                    "[\r\n]",

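The new deepseek-v3 pre-tokenizer splits digit runs into groups of at most three, keeps CJK runs together, and otherwise falls back to a GPT-2-style pattern. A rough standalone sketch of how these expressions could be exercised through the existing unicode_regex_split() helper (its declaration matches the signature visible in the src/unicode.cpp hunk below); the include path, sample string, and main() wrapper are assumptions added for illustration, not part of the commit:

    // Standalone preview of the DeepSeek V3 pre-tokenizer split (not part of the commit).
    // Assumes llama.cpp's internal src/unicode.h is on the include path.
    #include <cstdio>
    #include <string>
    #include <vector>

    #include "unicode.h" // unicode_regex_split(), internal llama.cpp helper

    int main() {
        // the same expressions the commit registers for LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM
        const std::vector<std::string> regex_exprs = {
            "\\p{N}{1,3}",
            "[一-龥぀-ゟ゠-ヿ]+",
            "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
        };

        // digit runs are cut into groups of at most three, CJK runs stay together
        const std::string text = "DeepSeek V3 has 256 experts 你好世界";

        for (const std::string & piece : unicode_regex_split(text, regex_exprs)) {
            printf("|%s", piece.c_str());
        }
        printf("|\n");
    }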
src/llama.cpp

Lines changed: 81 additions & 5 deletions

@@ -78,7 +78,13 @@

 // bump if necessary
 #define LLAMA_MAX_LAYERS  512
-#define LLAMA_MAX_EXPERTS 160 // DeepSeekV2
+#define LLAMA_MAX_EXPERTS 256 // DeepSeekV3
+
+enum llama_expert_gating_func_type {
+    LLAMA_EXPERT_GATING_FUNC_TYPE_NONE    = 0,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX = 1,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
+};

 //
 // helpers
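The new llama_expert_gating_func_type enum records how router logits are turned into expert probabilities: DeepSeek V2 keeps softmax gating, while DeepSeek V3 uses a sigmoid gate. A minimal self-contained sketch of the two options on plain floats, for intuition only (the real implementation further down operates on ggml tensors):

    // Illustration only: softmax vs. sigmoid gating over router logits, on plain floats.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    // softmax gating: probabilities over all experts, summing to 1
    static std::vector<float> gate_softmax(const std::vector<float> & logits) {
        const float max_l = *std::max_element(logits.begin(), logits.end());
        std::vector<float> p(logits.size());
        float sum = 0.0f;
        for (size_t i = 0; i < logits.size(); ++i) {
            p[i] = std::exp(logits[i] - max_l); // subtract max for numerical stability
            sum += p[i];
        }
        for (float & v : p) {
            v /= sum;
        }
        return p;
    }

    // sigmoid gating (DeepSeek V3): each expert is scored independently in (0, 1)
    static std::vector<float> gate_sigmoid(const std::vector<float> & logits) {
        std::vector<float> p(logits.size());
        for (size_t i = 0; i < logits.size(); ++i) {
            p[i] = 1.0f / (1.0f + std::exp(-logits[i]));
        }
        return p;
    }

    int main() {
        const std::vector<float> logits = {1.5f, -0.2f, 0.7f, 0.1f};
        for (float v : gate_softmax(logits)) printf("%.3f ", v);
        printf("(softmax)\n");
        for (float v : gate_sigmoid(logits)) printf("%.3f ", v);
        printf("(sigmoid)\n");
    }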
@@ -282,6 +288,8 @@ enum llm_kv {
     LLM_KV_EXPERT_USED_COUNT,
     LLM_KV_EXPERT_SHARED_COUNT,
     LLM_KV_EXPERT_WEIGHTS_SCALE,
+    LLM_KV_EXPERT_WEIGHTS_NORM,
+    LLM_KV_EXPERT_GATING_FUNC,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,

@@ -398,6 +406,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
     { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
     { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
+    { LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
+    { LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
     { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
     { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
     { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },

@@ -534,6 +544,7 @@ enum llm_tensor {
     LLM_TENSOR_FFN_DOWN_SHEXP,
     LLM_TENSOR_FFN_GATE_SHEXP,
     LLM_TENSOR_FFN_UP_SHEXP,
+    LLM_TENSOR_FFN_EXP_PROBS_B,
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
     LLM_TENSOR_LAYER_OUT_NORM,

@@ -1338,6 +1349,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
             { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
             { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+            { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
         },
     },
     {

@@ -2442,6 +2454,7 @@ enum e_model {
     MODEL_70B,
     MODEL_236B,
     MODEL_314B,
+    MODEL_671B,
     MODEL_SMALL,
     MODEL_MEDIUM,
     MODEL_LARGE,

@@ -2491,6 +2504,8 @@ struct llama_hparams {
     uint32_t n_ff_shexp = 0;
     uint32_t n_expert_shared = 0;
     float expert_weights_scale = 0.0;
+    bool expert_weights_norm = false;
+    uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;

     float f_norm_eps;
     float f_norm_rms_eps;

@@ -2790,6 +2805,7 @@ struct llama_layer {
     struct ggml_tensor * ffn_down_b; // b2
     struct ggml_tensor * ffn_up_b; // b3
     struct ggml_tensor * ffn_act;
+    struct ggml_tensor * ffn_exp_probs_b;

     // mamba proj
     struct ggml_tensor * ssm_in;

@@ -5376,6 +5392,14 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
     }
 }

+static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
+    switch (type) {
+        case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
+        case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid";
+        default: return "unknown";
+    }
+}
+
 static const char * llama_model_type_name(e_model type) {
     switch (type) {
         case MODEL_14M: return "14M";

@@ -5427,6 +5451,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_70B: return "70B";
         case MODEL_236B: return "236B";
         case MODEL_314B: return "314B";
+        case MODEL_671B: return "671B";
         case MODEL_SMALL: return "0.1B";
         case MODEL_MEDIUM: return "0.4B";
         case MODEL_LARGE: return "0.8B";

@@ -6109,6 +6134,13 @@ static void llm_load_hparams(
             ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
             ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
             ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+            ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+            ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+            if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+                // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
+                // that have no expert_gating_func model parameter set
+                hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
+            }
             ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);

             switch (hparams.n_layer) {
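Both new keys are read as optional (note the trailing false passed to ml.get_key), and an unset gating function falls back to softmax, so existing DeepSeek V2 and V2.5 GGUFs keep loading unchanged. For orientation, a hypothetical metadata excerpt that a DeepSeek V3 GGUF might carry, assuming V3 reuses the existing deepseek2 architecture prefix (the graph changes further down are in the DeepSeek2 builder); the values are illustrative and not dumped from a real file:

    deepseek2.expert_count        = 256
    deepseek2.expert_weights_norm = true
    deepseek2.expert_gating_func  = 2     (LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID)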
@@ -6430,6 +6462,10 @@ static void llm_load_vocab(
                 tokenizer_pre == "deepseek-coder") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
             vocab.tokenizer_clean_spaces = false;
+        } else if (
+                tokenizer_pre == "deepseek-v3") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
+            vocab.tokenizer_clean_spaces = false;
         } else if (
                 tokenizer_pre == "falcon") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;

@@ -7103,6 +7139,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
         LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
         LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+        LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
+        LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((enum llama_expert_gating_func_type) hparams.expert_gating_func));
         LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
     }

@@ -7250,6 +7288,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
     {LLM_TENSOR_FFN_DOWN_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
     {LLM_TENSOR_FFN_GATE_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
     {LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+    {LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     // this tensor is loaded for T5, but never used
     {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
 };

@@ -8961,6 +9000,7 @@ static bool llm_load_tensors(
                     layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                 } else {
                     layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+                    layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);

                     if (n_expert == 0) {
                         throw std::runtime_error("n_expert must be > 0");

@@ -9831,12 +9871,14 @@ static struct ggml_tensor * llm_build_moe_ffn(
         struct ggml_tensor * up_exps,
         struct ggml_tensor * gate_exps,
         struct ggml_tensor * down_exps,
+        struct ggml_tensor * exp_probs_b,
         int64_t n_expert,
         int64_t n_expert_used,
         llm_ffn_op_type type_op,
         bool norm_w,
         bool scale_w,
         float w_scale,
+        llama_expert_gating_func_type gating_op,
         const llm_build_cb & cb,
         int il) {
     int64_t n_embd = cur->ne[0];

@@ -9845,11 +9887,31 @@ static struct ggml_tensor * llm_build_moe_ffn(
     ggml_tensor * logits = llm_build_lora_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens]
     cb(logits, "ffn_moe_logits", il);

-    ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
+    ggml_tensor * probs = nullptr;
+    switch (gating_op) {
+        case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX:
+            {
+                probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
+            } break;
+        case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID:
+            {
+                probs = ggml_sigmoid(ctx, logits); // [n_expert, n_tokens]
+            } break;
+        default:
+            GGML_ABORT("fatal error");
+    }
     cb(probs, "ffn_moe_probs", il);

+    // add experts selection bias - introduced in DeepSeek V3
+    // leave probs unbiased as it's later used to get expert weights
+    ggml_tensor * selection_probs = probs;
+    if (exp_probs_b != nullptr) {
+        selection_probs = ggml_add(ctx, probs, exp_probs_b);
+        cb(selection_probs, "ffn_moe_probs_biased", il);
+    }
+
     // select experts
-    ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]
+    ggml_tensor * selected_experts = ggml_top_k(ctx, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
     cb(selected_experts->src[0], "ffn_moe_argsort", il);
     cb(selected_experts, "ffn_moe_topk", il);
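The selection bias is the main DeepSeek V3 routing change in this function: exp_probs_b only influences which experts are chosen, while the mixing weights are still taken from the unbiased probabilities. A small self-contained sketch of that selection logic on plain floats, assuming sigmoid-style probabilities and optional weight normalization (illustrative only; the code above expresses the same idea with ggml ops):

    // Illustration only: biased expert selection on plain floats.
    #include <algorithm>
    #include <cstdio>
    #include <numeric>
    #include <utility>
    #include <vector>

    // Pick the top n_expert_used experts by (prob + bias), but weight them by the
    // unbiased prob; optionally renormalize the selected weights to sum to 1.
    static std::vector<std::pair<int, float>> select_experts(
            const std::vector<float> & probs,  // gating output, e.g. sigmoid, unbiased
            const std::vector<float> & bias,   // per-expert selection bias (exp_probs_b)
            int n_expert_used,
            bool norm_w) {
        std::vector<int> idx(probs.size());
        std::iota(idx.begin(), idx.end(), 0);

        // rank experts by the biased score; the bias affects selection only
        std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
            [&](int a, int b) { return probs[a] + bias[a] > probs[b] + bias[b]; });

        std::vector<std::pair<int, float>> selected;
        float sum = 0.0f;
        for (int i = 0; i < n_expert_used; ++i) {
            selected.push_back({idx[i], probs[idx[i]]}); // weight taken from the unbiased prob
            sum += probs[idx[i]];
        }
        if (norm_w) {
            for (auto & e : selected) {
                e.second /= sum;
            }
        }
        return selected;
    }

    int main() {
        const std::vector<float> probs = {0.70f, 0.65f, 0.10f, 0.05f};
        const std::vector<float> bias  = {0.00f, 0.00f, 0.80f, 0.00f}; // pushes expert 2 into the top-k
        for (const auto & [e, w] : select_experts(probs, bias, /*n_expert_used=*/2, /*norm_w=*/true)) {
            printf("expert %d weight %.3f\n", e, w);
        }
    }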

@@ -10970,9 +11032,11 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
+                        nullptr,
                         n_expert, n_expert_used,
                         LLM_FFN_SILU, true,
                         false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                         cb, il);
                 cb(cur, "ffn_moe_out", il);
             }

@@ -11461,9 +11525,11 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
+                        nullptr,
                         n_expert, n_expert_used,
                         LLM_FFN_GELU, true,
                         false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                         cb, il);
                 cb(cur, "ffn_moe_out", il);

@@ -11602,9 +11668,11 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
+                        nullptr,
                         n_expert, n_expert_used,
                         LLM_FFN_SILU, true,
                         false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                         cb, il);
                 cb(cur, "ffn_moe_out", il);

@@ -12732,9 +12800,11 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
+                        nullptr,
                         n_expert, n_expert_used,
                         LLM_FFN_SILU, false,
                         false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                         cb, il);
                 cb(cur, "ffn_moe_out", il);

@@ -14726,9 +14796,11 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
+                        nullptr,
                         n_expert, n_expert_used,
                         LLM_FFN_SILU, false,
                         false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                         cb, il);
                 cb(cur, "ffn_moe_out", il);

@@ -15123,9 +15195,11 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
+                        nullptr,
                         n_expert, n_expert_used,
                         LLM_FFN_SILU, true,
                         false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                         cb, il);
                 cb(cur, "ffn_moe_out", il);

@@ -15338,9 +15412,11 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
+                        model.layers[il].ffn_exp_probs_b,
                         n_expert, n_expert_used,
-                        LLM_FFN_SILU, false,
-                        true, hparams.expert_weights_scale,
+                        LLM_FFN_SILU, hparams.expert_weights_norm,
+                        false, 0.0,
+                        (enum llama_expert_gating_func_type) hparams.expert_gating_func,
                         cb, il);
                 cb(moe_out, "ffn_moe_out", il);
src/unicode.cpp

Lines changed: 6 additions & 0 deletions

@@ -667,18 +667,24 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
         { "\\p{N}", codepoint_flags::NUMBER },
         { "\\p{L}", codepoint_flags::LETTER },
         { "\\p{P}", codepoint_flags::PUNCTUATION },
+        { "\\p{M}", codepoint_flags::ACCENT_MARK },
+        { "\\p{S}", codepoint_flags::SYMBOL },
     };

     static const std::map<int, int> k_ucat_cpt = {
         { codepoint_flags::NUMBER, 0xD1 },
         { codepoint_flags::LETTER, 0xD2 },
         { codepoint_flags::PUNCTUATION, 0xD3 },
+        { codepoint_flags::ACCENT_MARK, 0xD4 },
+        { codepoint_flags::SYMBOL, 0xD5 },
     };

     static const std::map<int, std::string> k_ucat_map = {
         { codepoint_flags::NUMBER, "\x30-\x39" }, // 0-9
         { codepoint_flags::LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z
         { codepoint_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
+        { codepoint_flags::ACCENT_MARK, "" }, // no sub-128 codepoints
+        { codepoint_flags::SYMBOL, "\\\x24\\\x2B\x3C-\x3E\x5E\x60\\\x7C" }, // $+<=>^`|
     };

     // compute collapsed codepoints only if needed by at least one regex
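std::regex has no native \p{...} support, so unicode_regex_split roughly works by rewriting those classes and matching against a collapsed copy of the text in which each codepoint of a given category is replaced by a single placeholder byte (0xD4 for accent marks, 0xD5 for symbols here), while the k_ucat_map entries supply the plain-ASCII ranges of each category. The new rows extend those tables to the \p{M} and \p{S} classes used by the DeepSeek V3 regex. A toy sketch of the collapsing idea, with a hard-coded ASCII-only classifier standing in for the real Unicode tables (not the actual unicode.cpp code):

    // Toy illustration of the "collapsed codepoints" trick: each character is replaced
    // by one placeholder byte per category so that a plain std::regex can stand in
    // for \p{...} classes.
    #include <cstdio>
    #include <map>
    #include <regex>
    #include <string>

    enum category { CAT_OTHER, CAT_NUMBER, CAT_LETTER, CAT_SYMBOL };

    static category classify(char c) {
        if (c >= '0' && c <= '9') return CAT_NUMBER;
        if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) return CAT_LETTER;
        if (c == '$' || c == '+' || c == '<' || c == '=' || c == '>' ||
            c == '^' || c == '`' || c == '|') return CAT_SYMBOL; // same symbols listed in the k_ucat_map comment
        return CAT_OTHER;
    }

    int main() {
        // placeholder bytes mirror k_ucat_cpt: 0xD1 numbers, 0xD2 letters, 0xD5 symbols
        const std::map<category, char> placeholder = {
            { CAT_NUMBER, '\xD1' }, { CAT_LETTER, '\xD2' }, { CAT_SYMBOL, '\xD5' },
        };

        const std::string text = "x = 42 + y";
        std::string collapsed;
        for (char c : text) {
            auto it = placeholder.find(classify(c));
            collapsed += (it != placeholder.end()) ? it->second : c;
        }

        // "\p{S}+" becomes a match on runs of the symbol placeholder byte
        const std::regex sym_class("\xD5+");
        printf("symbol run present: %s\n", std::regex_search(collapsed, sym_class) ? "yes" : "no");
    }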
