Add support for Arcee AI's upcoming AFM model #14185
Changes from all commits
506d215
f3b1e0f
68fa44b
9730b40
aa3c988
b2638a2
@@ -599,6 +599,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    hparams.use_kq_norm = false;
                }
            } break;
        case LLM_ARCH_ARCEE:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                // Arcee uses the same structure as Llama
                switch (hparams.n_layer) {
                    case 36: type = LLM_TYPE_4B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_DECI:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -4190,6 +4200,37 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        }
                    }
                } break;
            case LLM_ARCH_ARCEE:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

                    // if output is NULL, init from the input tok embed
                    if (output == NULL) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));

                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
                    }
                } break;
            default:
                throw std::runtime_error("unknown architecture");
        }
@@ -13411,6 +13452,141 @@ struct llm_build_dots1 : public llm_graph_context {
    }
};

struct llm_build_arcee : public llm_graph_context {
    llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;

        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
        GGML_ASSERT(n_embd_head == hparams.n_rot);

        ggml_tensor * cur;
        ggml_tensor * inpL;

        inpL = build_inp_embd(model.tok_embd);

        // inp_pos - contains the positions
        ggml_tensor * inp_pos = build_inp_pos();

        auto * inp_attn = build_attn_inp_kv_unified();

        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

        for (int il = 0; il < n_layer; ++il) {
            ggml_tensor * inpSA = inpL;

            // norm
            cur = build_norm(inpL,
                    model.layers[il].attn_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(cur, "attn_norm", il);

            // self-attention
            {
                // rope freq factors for llama3; may return nullptr for llama2 and other models
                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
                cb(Qcur, "Qcur", il);
                if (model.layers[il].bq) {
                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
                    cb(Qcur, "Qcur", il);
                }

                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
                cb(Kcur, "Kcur", il);
                if (model.layers[il].bk) {
                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
                    cb(Kcur, "Kcur", il);
                }

                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
                cb(Vcur, "Vcur", il);
                if (model.layers[il].bv) {
                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
                    cb(Vcur, "Vcur", il);
                }

                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

                Qcur = ggml_rope_ext(
                        ctx0, Qcur, inp_pos, rope_factors,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                        );

                Kcur = ggml_rope_ext(
                        ctx0, Kcur, inp_pos, rope_factors,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                        );

                cb(Qcur, "Qcur", il);
                cb(Kcur, "Kcur", il);
                cb(Vcur, "Vcur", il);

                cur = build_attn(inp_attn, gf,
                        model.layers[il].wo, model.layers[il].bo,
                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
                cb(cur, "attn_out", il);
            }

            if (il == n_layer - 1) {
                // skip computing output for unused tokens
                ggml_tensor * inp_out_ids = build_inp_out_ids();
                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
            }

            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
            cb(ffn_inp, "ffn_inp", il);

            // feed-forward network
            // ARCEE uses relu^2 instead of silu
            cur = build_norm(ffn_inp,
                    model.layers[il].ffn_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(cur, "ffn_norm", il);

            cur = build_ffn(cur,
                    model.layers[il].ffn_up,   NULL, NULL,
                    NULL,                      NULL, NULL,
                    model.layers[il].ffn_down, NULL, NULL,
                    NULL,
                    LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
Inline review discussion on the build_ffn call:

Review comment: Seems like the only difference between AFM and Llama is this activation function. Not sure if, in the future, we can abstract out this activation definition per-model (maybe as a hparam or a variable inside …).

Reply: It also lacks the FFN gate, but maybe that could also be abstracted?

Reply: If the gate is not present, its value will be NULL.

Reply: Ah right, that makes sense! Yeah, it definitely seems worth considering some extra abstraction here.

Reply: By the way, I'm just bringing this up for further discussion. Feel free to merge the current PR without that.
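To make the contrast discussed above concrete, here is a minimal, self-contained sketch (not part of this PR; the helper names are illustrative assumptions) of the two FFN activation paths on plain floats: the Llama-style gated SiLU, and the ungated relu^2 path this builder requests via LLM_FFN_RELU_SQR with a NULL gate tensor.

// sketch only: contrasts the two FFN activations discussed in the review thread
#include <cmath>
#include <cstdio>

// Llama-style gated activation: silu(gate) * up, where silu(x) = x * sigmoid(x)
static float ffn_silu_gated(float gate, float up) {
    const float silu = gate / (1.0f + std::exp(-gate));
    return silu * up;
}

// AFM-style activation: relu(up)^2, with no gate projection at all
// (in the build_ffn call above, the gate tensor is simply NULL)
static float ffn_relu_sqr(float up) {
    const float r = up > 0.0f ? up : 0.0f;
    return r * r;
}

int main() {
    std::printf("silu-gated(0.5, 2.0) = %f\n", ffn_silu_gated(0.5f, 2.0f));
    std::printf("relu^2(2.0)          = %f\n", ffn_relu_sqr(2.0f));
    return 0;
}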
            cb(cur, "ffn_out", il);

            cur = ggml_add(ctx0, cur, ffn_inp);
            cb(cur, "ffn_out", il);

            cur = build_cvec(cur, il);
            cb(cur, "l_out", il);

            // input for next layer
            inpL = cur;
        }

        cur = inpL;

        cur = build_norm(cur,
                model.output_norm, NULL,
                LLM_NORM_RMS, -1);

        cb(cur, "result_norm", -1);
        res->t_embd = cur;

        // lm_head
        cur = build_lora_mm(model.output, cur);

        cb(cur, "result_output", -1);
        res->t_logits = cur;

        ggml_build_forward_expand(gf, cur);
    }
};

llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
    llama_memory_i * res;
@@ -13753,6 +13929,10 @@ llm_graph_result_ptr llama_model::build_graph(
            {
                llm = std::make_unique<llm_build_dots1>(*this, params, gf);
            } break;
        case LLM_ARCH_ARCEE:
            {
                llm = std::make_unique<llm_build_arcee>(*this, params, gf);
            } break;
        default:
            GGML_ABORT("fatal error");
    }
@@ -13902,6 +14082,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_GRANITE_MOE:
        case LLM_ARCH_CHAMELEON:
        case LLM_ARCH_BAILINGMOE:
        case LLM_ARCH_ARCEE:
            return LLAMA_ROPE_TYPE_NORM;

        // the pairs of head values are offset by n_rot/2
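For context on the two rope-type groups this hunk touches, below is a minimal, self-contained sketch (an illustration only, independent of llama.cpp internals) of how the two RoPE layouts differ: LLAMA_ROPE_TYPE_NORM rotates adjacent pairs of head dimensions, while the NEOX-style layout referenced by the comment above rotates pairs offset by n_rot/2.

// sketch only: the two RoPE pairing conventions applied to a toy head vector
#include <cmath>
#include <cstdio>
#include <vector>

static void rope_pair(float & a, float & b, float theta) {
    const float c = std::cos(theta), s = std::sin(theta);
    const float a0 = a, b0 = b;
    a = a0 * c - b0 * s;
    b = a0 * s + b0 * c;
}

int main() {
    const int   n_rot     = 8;        // rotary dimensions per head (illustrative)
    const int   pos       = 3;        // token position
    const float freq_base = 10000.0f;

    std::vector<float> norm_style(n_rot, 1.0f);
    std::vector<float> neox_style(n_rot, 1.0f);

    for (int i = 0; i < n_rot / 2; ++i) {
        const float theta = pos * std::pow(freq_base, -2.0f * i / n_rot);
        // NORM layout: rotate adjacent pairs (2i, 2i+1)
        rope_pair(norm_style[2*i], norm_style[2*i + 1], theta);
        // NEOX layout: rotate pairs offset by n_rot/2, i.e. (i, i + n_rot/2)
        rope_pair(neox_style[i], neox_style[i + n_rot/2], theta);
    }

    for (int i = 0; i < n_rot; ++i) {
        std::printf("dim %d: norm=%.3f neox=%.3f\n", i, norm_style[i], neox_style[i]);
    }
    return 0;
}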
Review comment: Either this shouldn't have been added, or you forgot to add the new hash.

Reply: Addressed in #14207.