Add LoRA support #820

Merged: 15 commits, merged Apr 17, 2023
Changes from 1 commit
Support more layer types, fix memory and generation issues
slaren committed Apr 16, 2023
commit c45868ba9f5e358b42e8957a7c025efd6139ad7a
47 changes: 25 additions & 22 deletions convert-lora-to-ggml.py
@@ -44,18 +44,18 @@ class QuantizedDataType:
}

HF_SUBLAYER_TO_GGML = {
"self_attn.q_proj": "attention.wq.weight",
"self_attn.k_proj": "attention.wk.weight",
"self_attn.v_proj": "attention.wv.weight",
"self_attn.o_proj": "attention.wo.weight",
# "embed_tokens.weight": "tok_embeddings.weight",
# "norm.weight": "norm.weight",
# "lm_head.weight": "output.weight",
# "mlp.gate_proj": "feed_forward.w1.weight",
# "mlp.down_proj": "feed_forward.w2.weight",
# "mlp.up_proj": "feed_forward.w3.weight",
# "input_layernorm": "attention_norm.weight",
# "post_attention_layernorm": "ffn_norm.weight",
"self_attn.q_proj": "attention.wq",
"self_attn.k_proj": "attention.wk",
"self_attn.v_proj": "attention.wv",
"self_attn.o_proj": "attention.wo",
"mlp.gate_proj": "feed_forward.w1",
"mlp.down_proj": "feed_forward.w2",
"mlp.up_proj": "feed_forward.w3",
"input_layernorm": "attention_norm",
"post_attention_layernorm": "ffn_norm",
# "norm": "norm",
# "embed_tokens": "tok_embeddings",
# "lm_head": "output",
}


@@ -71,7 +71,9 @@ def translate_tensor_name(t):
print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}")
sys.exit(1)

output_string = f"layers.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.lora{lora_type}"
output_string = (
f"layers.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
)
return output_string
else:
print(f"Error: unrecognized tensor {t}")
@@ -138,16 +140,17 @@ def write_tensor_header(self, name: str, shape: Sequence[int], data_type: 1) ->

write_file_header(fout, params)
for k, v in model.items():
# since ggml doesn't always support other types for the second operand,
# the tensors are always converted and exported as f32
v = v.float()
if k.endswith("lora_A.weight"):
if v.dtype != torch.float16 and v.dtype != torch.float32:
v = v.float()
v = v.T
else:
v = v.float()

t = v.numpy()
if "lora_A" in k:
t = t.T
print(
f"{k} => {translate_tensor_name(k)} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB"
)
write_tensor_header(fout, translate_tensor_name(k), t.shape, t.dtype)
tname = translate_tensor_name(k)
print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
write_tensor_header(fout, tname, t.shape, t.dtype)
t.tofile(fout)

print(f"Converted {input_json} and {input_model} to {output_path}")
5 changes: 0 additions & 5 deletions ggml.c
@@ -5955,11 +5955,6 @@ static void ggml_compute_forward_add_q_f32(
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);

GGML_ASSERT(ne0 == ne01);
GGML_ASSERT(ne1 == ne11);
GGML_ASSERT(ne2 == ne02);
GGML_ASSERT(ne3 == ne03);

GGML_ASSERT(src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1);
GGML_ASSERT(dst->type == src0->type);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
20 changes: 11 additions & 9 deletions llama.cpp
@@ -617,6 +617,7 @@ struct llama_model_loader {
throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
}

return get_tensor_for(lt);
}

@@ -1799,7 +1800,8 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor


// create a temporary ggml context to store the lora tensors
std::vector<uint8_t> buf(1024 * 1024 * 100);
// todo: calculate size from biggest possible tensor
std::vector<uint8_t> buf(1024ull * 1024ull * 1024ull);
struct ggml_init_params params;
params.mem_size = buf.size();
params.mem_buffer = buf.data();
@@ -1830,11 +1832,9 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
break;
}

int32_t nelements = 1;
int32_t ne[2] = { 1, 1 };
for (int i = 0; i < n_dims; ++i) {
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
nelements *= ne[i];
}

std::string name(length, 0);
@@ -1903,24 +1903,26 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
}

// w = w + BA*s
ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraB, loraA);
ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);

if (scaling != 1.0f) {
ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
BA = ggml_scale(lora_ctx, BA, scale_tensor);
}

//printf("%s: (B)(%d %d %d %d) x (A)(%d %d %d %d) => (BA)(%d %d %d %d) + (T)(%d %d %d %d)\n",
// base_name.c_str(),
// (int)loraB->ne[0], (int)loraB->ne[1], (int)loraB->ne[2], (int)loraB->ne[3],
// (int)loraA->ne[0], (int)loraA->ne[1], (int)loraA->ne[2], (int)loraA->ne[3],
// (int)BA->ne[0], (int)BA->ne[1], (int)BA->ne[2], (int)BA->ne[3],
// (int)tensor->ne[0], (int)tensor->ne[1], (int)tensor->ne[2], (int)tensor->ne[3]
//);
ggml_tensor * r = ggml_add_inplace(lora_ctx, tensor, BA);
//ggml_tensor * r = ggml_add(lora_ctx, tensor, BA);
//r = ggml_cpy(lora_ctx, r, tensor);

struct ggml_cgraph gf = ggml_build_forward(r);
gf.n_threads = n_threads;
ggml_graph_compute(lora_ctx, &gf);

// hack until ggml_cpy supports quantized tensors
// memcpy(tensor->data, r->data, ggml_nbytes(tensor));

// we won't need these tensors again, reset the context to save memory
ggml_free(lora_ctx);
lora_ctx = ggml_init(params);
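
For reference, the merge performed above is the standard LoRA update noted in the comment, w = w + BA*s, now with the ggml_mul_mat operand order swapped and the result added in place. A minimal NumPy sketch of the same arithmetic; the alpha/rank scaling convention is the usual LoRA one and is an assumption here, since the diff only shows a precomputed `scaling`:

```python
import numpy as np

# Sketch of the update applied above: W' = W + scaling * (B @ A).
# A is (rank, n_in) and B is (n_out, rank); scaling = alpha / rank is the
# common LoRA convention (assumed; the patch uses a precomputed `scaling`).
rng = np.random.default_rng(0)
n_out, n_in, rank, alpha = 8, 6, 2, 4.0

W = rng.standard_normal((n_out, n_in), dtype=np.float32)  # base weight
A = rng.standard_normal((rank, n_in), dtype=np.float32)   # loraA
B = rng.standard_normal((n_out, rank), dtype=np.float32)  # loraB

scaling = alpha / rank
W_merged = W + scaling * (B @ A)  # what ggml_add_inplace leaves in `tensor`

# Merging is equivalent to running the adapter at inference time:
x = rng.standard_normal(n_in, dtype=np.float32)
assert np.allclose(W_merged @ x, W @ x + scaling * (B @ (A @ x)), atol=1e-4)
```
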