falcon : support non-40B models
ggerganov committed Aug 22, 2023
1 parent 3c7c325 commit 2d58444
Showing 2 changed files with 26 additions and 21 deletions.
7 changes: 5 additions & 2 deletions convert-falcon-hf-to-gguf.py
@@ -101,7 +101,10 @@ def count_model_parts(dir_model: str) -> int:
gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["n_head"])
-if "n_head_kv" in hparams: gguf_writer.add_head_count_kv(hparams["n_head_kv"])
+if "n_head_kv" in hparams:
+    gguf_writer.add_head_count_kv(hparams["n_head_kv"])
+else:
+    gguf_writer.add_head_count_kv(1)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])

# TOKENIZATION
@@ -201,7 +204,7 @@ def count_model_parts(dir_model: str) -> int:

# params for qkv transform
n_head = hparams["n_head"]
-n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else n_head
+n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1

head_dim = hparams["hidden_size"] // n_head
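
For context, a minimal sketch of what the new "else 1" fallback implies for the fused QKV width the loader expects. This is illustrative only; the 7B/40B-style hyperparameter values in it are assumptions, not something stated in this diff:

# Illustrative sketch, not part of the patch: the converter now treats a config
# without "n_head_kv" as multi-query attention, i.e. a single shared KV head.
def kv_head_count(hparams: dict) -> int:
    # same fallback as the patched converter above
    return hparams["n_head_kv"] if "n_head_kv" in hparams else 1

for name, hp in [
    ("7B-style",  {"hidden_size": 4544, "n_head": 71}),                  # no "n_head_kv" key (assumed)
    ("40B-style", {"hidden_size": 8192, "n_head": 128, "n_head_kv": 8}), # values assumed
]:
    n_head_kv  = kv_head_count(hp)
    head_dim   = hp["hidden_size"] // hp["n_head"]
    n_embd_gqa = head_dim * n_head_kv
    # fused QKV width, matching the {n_embd, n_embd + 2*n_embd_gqa} wqkv shape in llama.cpp
    print(f"{name}: n_head_kv={n_head_kv}, head_dim={head_dim}, "
          f"qkv_width={hp['hidden_size'] + 2 * n_embd_gqa}")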

40 changes: 21 additions & 19 deletions llama.cpp
@@ -1859,10 +1859,13 @@ static void llm_load_tensors(
for (uint32_t i = 0; i < n_layer; ++i) {
auto & layer = model.layers[i];

-layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, GGML_BACKEND_CPU);
-layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, GGML_BACKEND_CPU);
-layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, GGML_BACKEND_CPU);
-layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, GGML_BACKEND_CPU);
+layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, GGML_BACKEND_CPU);
+layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, GGML_BACKEND_CPU);
+
+if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
+    layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, GGML_BACKEND_CPU);
+    layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, GGML_BACKEND_CPU);
+}

layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, GGML_BACKEND_CPU);
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, GGML_BACKEND_CPU);
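
Note on the loader change above: the two attention_norm_2 tensors are now created only when they are actually found in the GGUF file, which is the 40B-style layout with its second layer norm. For 7B-style files the gguf_find_tensor probe returns a negative index and the layer's attn_norm_2 / attn_norm_2_b fields are simply left unset. The graph-building code below still distinguishes the two variants through the n_head_kv == 8 check rather than through these tensor pointers.
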
@@ -2421,29 +2424,28 @@ static struct ggml_cgraph * llm_build_falcon(

for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * cur;
-struct ggml_tensor * layernorm_output;
+struct ggml_tensor * attn_norm;

// self-attention
{
-layernorm_output = ggml_norm(ctx0, inpL);
+attn_norm = ggml_norm(ctx0, inpL);

-layernorm_output = ggml_add(ctx0,
+attn_norm = ggml_add(ctx0,
ggml_mul(ctx0,
-ggml_repeat(ctx0, model.layers[il].attn_norm, layernorm_output),
-layernorm_output),
-ggml_repeat(ctx0, model.layers[il].attn_norm_b, layernorm_output));
+ggml_repeat(ctx0, model.layers[il].attn_norm, attn_norm),
+attn_norm),
+ggml_repeat(ctx0, model.layers[il].attn_norm_b, attn_norm));

-if ( hparams.n_head_kv == 8 ) { // Falcon-40B
+if (hparams.n_head_kv == 8) { // Falcon-40B
cur = ggml_norm(ctx0, inpL);

cur = ggml_add(ctx0,
ggml_mul(ctx0,
ggml_repeat(ctx0, model.layers[il].attn_norm_2, cur),
cur),
ggml_repeat(ctx0, model.layers[il].attn_norm_2_b, cur));
-}
-else { // Falcon 7B
-cur = layernorm_output;
+} else { // Falcon 7B
+cur = attn_norm;
}

// compute QKV
@@ -2563,8 +2565,8 @@ static struct ggml_cgraph * llm_build_falcon(
}
}

-struct ggml_tensor* inpFF = layernorm_output;
-struct ggml_tensor* attn_out = ggml_cpy(
+struct ggml_tensor * inpFF = attn_norm;
+struct ggml_tensor * attn_out = ggml_cpy(
ctx0, cur, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));

{
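
One readability gain from the layernorm_output -> attn_norm rename shows up here: inpFF, the input of the feed-forward branch, is taken from the block's first layer norm rather than from the attention output, reflecting Falcon's parallel attention/MLP layout.
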
@@ -2607,7 +2609,7 @@ static struct ggml_cgraph * llama_build_graph(
const float * embd,
int n_tokens,
int n_past) {
-const auto & model = lctx.model;
+const auto & model = lctx.model;

struct ggml_cgraph * result = NULL;

@@ -2669,8 +2671,8 @@ static bool llama_eval_internal(

GGML_ASSERT(!!kv_self.ctx);

-const int64_t n_embd = hparams.n_embd;
-const int64_t n_vocab = hparams.n_vocab;
+const int64_t n_embd = hparams.n_embd;
+const int64_t n_vocab = hparams.n_vocab;

ggml_allocr_reset(lctx.alloc);

