Add LoRA support #820

Merged: 15 commits, merged Apr 17, 2023
Changes from 1 commit
Support more layer types, fix memory and generation issues
slaren committed Apr 16, 2023
commit c45868ba9f5e358b42e8957a7c025efd6139ad7a
47 changes: 25 additions & 22 deletions convert-lora-to-ggml.py
@@ -44,18 +44,18 @@ class QuantizedDataType:
}

HF_SUBLAYER_TO_GGML = {
"self_attn.q_proj": "attention.wq.weight",
"self_attn.k_proj": "attention.wk.weight",
"self_attn.v_proj": "attention.wv.weight",
"self_attn.o_proj": "attention.wo.weight",
# "embed_tokens.weight": "tok_embeddings.weight",
# "norm.weight": "norm.weight",
# "lm_head.weight": "output.weight",
# "mlp.gate_proj": "feed_forward.w1.weight",
# "mlp.down_proj": "feed_forward.w2.weight",
# "mlp.up_proj": "feed_forward.w3.weight",
# "input_layernorm": "attention_norm.weight",
# "post_attention_layernorm": "ffn_norm.weight",
"self_attn.q_proj": "attention.wq",
"self_attn.k_proj": "attention.wk",
"self_attn.v_proj": "attention.wv",
"self_attn.o_proj": "attention.wo",
"mlp.gate_proj": "feed_forward.w1",
"mlp.down_proj": "feed_forward.w2",
"mlp.up_proj": "feed_forward.w3",
"input_layernorm": "attention_norm",
"post_attention_layernorm": "ffn_norm",
# "norm": "norm",
# "embed_tokens": "tok_embeddings",
# "lm_head": "output",
}


@@ -71,7 +71,9 @@ def translate_tensor_name(t):
print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}")
sys.exit(1)

output_string = f"layers.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.lora{lora_type}"
output_string = (
f"layers.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
)
return output_string
else:
print(f"Error: unrecognized tensor {t}")
@@ -138,16 +140,17 @@ def write_tensor_header(self, name: str, shape: Sequence[int], data_type: 1) ->

write_file_header(fout, params)
for k, v in model.items():
# since ggml doesn't always support other types for the second operand,
# the tensors are always converted and exported as f32
v = v.float()
if k.endswith("lora_A.weight"):
if v.dtype != torch.float16 and v.dtype != torch.float32:
v = v.float()
v = v.T
else:
v = v.float()

t = v.numpy()
if "lora_A" in k:
t = t.T
print(
f"{k} => {translate_tensor_name(k)} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB"
)
write_tensor_header(fout, translate_tensor_name(k), t.shape, t.dtype)
tname = translate_tensor_name(k)
print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
write_tensor_header(fout, tname, t.shape, t.dtype)
t.tofile(fout)

print(f"Converted {input_json} and {input_model} to {output_path}")
5 changes: 0 additions & 5 deletions ggml.c
@@ -5955,11 +5955,6 @@ static void ggml_compute_forward_add_q_f32(
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);

GGML_ASSERT(ne0 == ne01);
GGML_ASSERT(ne1 == ne11);
GGML_ASSERT(ne2 == ne02);
GGML_ASSERT(ne3 == ne03);

GGML_ASSERT(src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1);
GGML_ASSERT(dst->type == src0->type);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
20 changes: 11 additions & 9 deletions llama.cpp
@@ -617,6 +617,7 @@ struct llama_model_loader {
throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
}

return get_tensor_for(lt);
}

@@ -1799,7 +1800,8 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor


// create a temporary ggml context to store the lora tensors
std::vector<uint8_t> buf(1024 * 1024 * 100);
// todo: calculate size from biggest possible tensor
std::vector<uint8_t> buf(1024ull * 1024ull * 1024ull);
struct ggml_init_params params;
params.mem_size = buf.size();
params.mem_buffer = buf.data();
@@ -1830,11 +1832,9 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
break;
}

int32_t nelements = 1;
int32_t ne[2] = { 1, 1 };
for (int i = 0; i < n_dims; ++i) {
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
nelements *= ne[i];
}

std::string name(length, 0);
@@ -1903,24 +1903,26 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
}

// w = w + BA*s
ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraB, loraA);
ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);

if (scaling != 1.0f) {
ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
BA = ggml_scale(lora_ctx, BA, scale_tensor);
}

//printf("%s: (B)(%d %d %d %d) x (A)(%d %d %d %d) => (BA)(%d %d %d %d) + (T)(%d %d %d %d)\n",
// base_name.c_str(),
// (int)loraB->ne[0], (int)loraB->ne[1], (int)loraB->ne[2], (int)loraB->ne[3],
// (int)loraA->ne[0], (int)loraA->ne[1], (int)loraA->ne[2], (int)loraA->ne[3],
// (int)BA->ne[0], (int)BA->ne[1], (int)BA->ne[2], (int)BA->ne[3],
// (int)tensor->ne[0], (int)tensor->ne[1], (int)tensor->ne[2], (int)tensor->ne[3]
//);
ggml_tensor * r = ggml_add_inplace(lora_ctx, tensor, BA);
//ggml_tensor * r = ggml_add(lora_ctx, tensor, BA);
//r = ggml_cpy(lora_ctx, r, tensor);

struct ggml_cgraph gf = ggml_build_forward(r);
gf.n_threads = n_threads;
ggml_graph_compute(lora_ctx, &gf);

// hack until ggml_cpy supports quantized tensors
// memcpy(tensor->data, r->data, ggml_nbytes(tensor));

// we won't need these tensors again, reset the context to save memory
ggml_free(lora_ctx);
lora_ctx = ggml_init(params);
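
For reference, the merge performed above is the standard LoRA update noted in the comment, w = w + BA*s, now with the ggml_mul_mat operand order swapped and the result added in place. A minimal NumPy sketch of the same arithmetic; the alpha/rank scaling convention is the usual LoRA one and is an assumption here, since the diff only shows a precomputed `scaling`:

```python
import numpy as np

# Sketch of the update applied above: W' = W + scaling * (B @ A).
# A is (rank, n_in) and B is (n_out, rank); scaling = alpha / rank is the
# common LoRA convention (assumed; the patch uses a precomputed `scaling`).
rng = np.random.default_rng(0)
n_out, n_in, rank, alpha = 8, 6, 2, 4.0

W = rng.standard_normal((n_out, n_in), dtype=np.float32)  # base weight
A = rng.standard_normal((rank, n_in), dtype=np.float32)   # loraA
B = rng.standard_normal((n_out, rank), dtype=np.float32)  # loraB

scaling = alpha / rank
W_merged = W + scaling * (B @ A)  # what ggml_add_inplace leaves in `tensor`

# Merging is equivalent to running the adapter at inference time:
x = rng.standard_normal(n_in, dtype=np.float32)
assert np.allclose(W_merged @ x, W @ x + scaling * (B @ (A @ x)), atol=1e-4)
```
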