scripts: use GGMF version of convert-pth-to-ggml.py

automationkit · Apr 4, 2023 · 4bc0c03
1 parent f103ae9
commit 4bc0c03
Showing 1 changed file with 56 additions and 151 deletions.
diff --git a/scripts/convert-pth-to-ggml.py b/scripts/convert-pth-to-ggml.py
@@ -1,4 +1,4 @@
-# Convert a LLaMA model checkpoint to a ggjt compatible file
+# Convert a LLaMA model checkpoint to a ggml compatible file
 #
 # Load the model using Torch
 # Iterate over all variables and write them to a binary file.
@@ -24,64 +24,16 @@
 
 from sentencepiece import SentencePieceProcessor
 
-QK = 32
-
-GGML_TYPE_Q4_0  = 0
-GGML_TYPE_Q4_1  = 1
-GGML_TYPE_I8    = 2
-GGML_TYPE_I16   = 3
-GGML_TYPE_I32   = 4
-GGML_TYPE_F16   = 5
-GGML_TYPE_F32   = 6
-
-WTYPES = {
-    0: GGML_TYPE_F32,
-    1: GGML_TYPE_F16,
-    2: GGML_TYPE_Q4_0,
-    3: GGML_TYPE_Q4_1,
-}
-
-GGML_BLCK_SIZE = {
-    GGML_TYPE_Q4_0:  QK,
-    GGML_TYPE_Q4_1:  QK,
-    GGML_TYPE_I8:    1,
-    GGML_TYPE_I16:   1,
-    GGML_TYPE_I32:   1,
-    GGML_TYPE_F16:   1,
-    GGML_TYPE_F32:   1,
-}
-
-GGML_TYPE_SIZE = {
-    GGML_TYPE_Q4_0: 4   + QK//2,
-    GGML_TYPE_Q4_1: 4*2 + QK//2,
-    GGML_TYPE_I8:   1,
-    GGML_TYPE_I16:  2,
-    GGML_TYPE_I32:  4,
-    GGML_TYPE_F16:  2,
-    GGML_TYPE_F32:  4,
-}
-
-def ggml_nelements(shape):
-    r = 1
-    for i in shape:
-        r *= i
-    return r
-
-def ggml_nbytes(shape, ftype):
-    x = ggml_nelements(shape)
-    t = WTYPES[ftype]
-    x *= GGML_TYPE_SIZE[t]
-    x //= GGML_BLCK_SIZE[t]
-    return x
-
 def parse_args():
+
     parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
     parser.add_argument('dir_model',  help='directory containing the model checkpoint')
     parser.add_argument('ftype',      help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
     parser.add_argument('vocab_only', help='only write vocab to file', type=int, default=0, nargs='?')
     return parser.parse_args()
 
 def get_n_parts(dim):
+
     mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
     n_parts = mappings.get(dim)
     if n_parts is None:
@@ -92,24 +44,30 @@ def get_n_parts(dim):
     return n_parts
 
 def load_hparams_and_tokenizer(dir_model):
+
     # `dir_model` is something like `models/7B` or `models/7B/`.
     # "tokenizer.model" is expected under model's parent dir.
     # When `dir_model` is a symlink, f"{dir_model}/../tokenizer.model" would not be found.
     # Let's use the model's parent dir directly.
     model_parent_dir = os.path.dirname(os.path.normpath(dir_model))
+
     fname_hparams = f"{dir_model}/params.json"
     fname_tokenizer = f"{model_parent_dir}/tokenizer.model"
+
     with open(fname_hparams, "r") as f:
         hparams = json.load(f)
         print(hparams)
+
     tokenizer = SentencePieceProcessor(fname_tokenizer)
     hparams.update({"vocab_size": tokenizer.vocab_size()})
+
     return hparams, tokenizer
 
 def write_header(fout, hparams, ftype):
+
     keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
     values = [
-        0x67676a74,  # magic: ggjt in hex
+        0x67676d66,  # magic: ggmf in hex
         1, # file version
         *[hparams[key] for key in keys],
         hparams["dim"] // hparams["n_heads"],  # rot (obsolete)
@@ -118,9 +76,10 @@ def write_header(fout, hparams, ftype):
     fout.write(struct.pack("i" * len(values), *values))
 
 def write_tokens(fout, tokenizer):
+
     for i in range(tokenizer.vocab_size()):
         if tokenizer.is_unknown(i):
-            text = " \u2047 ".encode()
+            text = " \u2047 ".encode("utf-8")
         elif tokenizer.is_control(i):
             text = b""
         elif tokenizer.is_byte(i):
@@ -131,144 +90,90 @@ def write_tokens(fout, tokenizer):
             byte_value = int(piece[3:-1], 16)
             text = struct.pack("B", byte_value)
         else:
-            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
+            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
         fout.write(struct.pack("i", len(text)))
         fout.write(text)
         fout.write(struct.pack("f", tokenizer.get_score(i)))
 
-def process_and_write_variables(fout, model, ftype, part_id, n_parts):
+def process_and_write_variables(fout, model, ftype):
+
     for name, datao in model.items():
+
         if name.endswith("freqs"):
             continue
 
-        # remove dimensions with a single element
-        data = datao.numpy().squeeze()
-        partshape = data.shape
-        n_dims = len(data.shape)
-        assert n_dims in (1, 2)
+        shape = datao.shape
 
-        print(f"Processing variable: {name} with shape: {partshape} and type: {datao.dtype}")
+        print(f"Processing variable: {name} with shape: {shape} and type: {datao.dtype}")
 
-        # coerce single-dimensional tensors from float16 to float32
+        data = datao.numpy().squeeze()
+        n_dims = len(shape)
+
+        # default type is fp16
         ftype_cur = 1
         if ftype == 0 or n_dims == 1:
             print("  Converting to float32")
             data = data.astype(np.float32)
             ftype_cur = 0
-        blck_size = GGML_BLCK_SIZE[WTYPES[ftype_cur]]
-        type_size = GGML_TYPE_SIZE[WTYPES[ftype_cur]]
-
-        # determine dimension along which multipart tensor is sharded
-        #
-        # split_dim 0 regex:
-        #   - output.*
-        #   - layers.*.attention.wq.weight
-        #   - layers.*.attention.wk.weight
-        #   - layers.*.attention.wv.weight
-        #   - layers.*.feed_forward.w1.weight
-        #   - layers.*.feed_forward.w3.weight
-        #
-        # split_dim 1 regex:
-        #   - tok_embeddings.*
-        #   - layers.*.attention.wo.weight
-        #   - layers.*.feed_forward.w2.weight
-        #
-        if n_dims > 1:
-            split_dim = 1
-            if "tok_embeddings" in name:
-                split_dim = 1
-            elif "layers" in name:
-                if "attention.wo.weight" in name:
-                    split_dim = 1
-                elif "feed_forward.w2.weight" in name:
-                    split_dim = 1
-                else:
-                    split_dim = 0
-            elif "output" in name:
-                split_dim = 0
-
-        # output tensor header
-        fullshape = list(partshape)
-        if n_dims > 1:
-            fullshape[split_dim] *= n_parts
-        sname = name.encode()
-        fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
-        for dim in reversed(fullshape):
+
+        # header
+        sname = name.encode('utf-8')
+        fout.write(struct.pack("iii", len(data.shape), len(sname), ftype_cur))
+        for dim in reversed(data.shape):
             fout.write(struct.pack("i", dim))
         fout.write(sname)
 
-        # ensure tensor data is aligned
-        tensor_data_offset = fout.tell()
-        while tensor_data_offset % QK != 0:
-            fout.write(struct.pack("B", 0))
-            tensor_data_offset += 1
-
-        # output unified mappable tensor data
-        if n_dims == 1 or n_parts == 1:
-            # copy tensor which we thankfully received in one piece
-            if part_id == 0:
-                data.tofile(fout)
-        elif split_dim == 0:
-            # reassemble multifile tensor containing some of the rows
-            rows_per_chunk = partshape[0]
-            current_row = part_id * rows_per_chunk
-            bytes_per_row = fullshape[1] // blck_size * type_size
-            offset = current_row * bytes_per_row
-            fout.seek(tensor_data_offset + offset)
-            data.tofile(fout)
-        elif split_dim == 1:
-            # reassemble multifile tensor containing some of the cols
-            cols_per_chunk = partshape[1]
-            current_col = part_id * cols_per_chunk
-            bytes_per_row = fullshape[1] // blck_size * type_size
-            offset_current_col = current_col // blck_size * type_size
-            for row in range(partshape[0]):
-                offset_row = row * bytes_per_row
-                offset = offset_row + offset_current_col
-                fout.seek(tensor_data_offset + offset)
-                data[row].tofile(fout)
-
-        # advance file position to next tensor
-        fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype_cur))
+        # data output to file
+        data.tofile(fout)
 
 def main():
+
     args = parse_args()
     dir_model = args.dir_model
     ftype = args.ftype
     ftype_str = ["f32", "f16"]
+
     hparams, tokenizer = load_hparams_and_tokenizer(dir_model)
 
     print(args)
 
     # if only writing vocab to file
     if args.vocab_only:
+
         fname_model = f"{dir_model}/consolidated.00.pth"
         fname_out = f"{dir_model}/ggml-vocab.bin"
+
         print(f"Extracting only the vocab from '{fname_model}'\n")
+
+
         with open(fname_out, "wb") as fout:
             write_header(fout, hparams, ftype)
             write_tokens(fout, tokenizer)
+
+
         print(f"Done. Output file: {fname_out}\n")
+
         return
 
     n_parts = get_n_parts(hparams["dim"])
-    fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin"
-
-    # we output a single file for ggml
-    with open(fname_out, "wb") as fout:
-        write_header(fout, hparams, ftype)
-        write_tokens(fout, tokenizer)
-        offset_of_tensors = fout.tell()
-        # the tensors we load could be split across multiple files
-        for part_id in range(n_parts):
-            fout.seek(offset_of_tensors)
-            print(f"Processing part {part_id+1} of {n_parts}\n")
-            fname_model = f"{dir_model}/consolidated.0{part_id}.pth"
-            model = torch.load(fname_model, map_location="cpu")
-            process_and_write_variables(fout, model, ftype, part_id, n_parts)
-            del model
-
-    print(f"Done. Output file: {fname_out}\n")
+
+    for p in range(n_parts):
+
+        print(f"Processing part {p+1} of {n_parts}\n")
+
+        fname_model = f"{dir_model}/consolidated.0{p}.pth"
+        fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin{'' if p == 0 else '.' + str(p)}"
+
+        model = torch.load(fname_model, map_location="cpu")
+
+        with open(fname_out, "wb") as fout:
+            write_header(fout, hparams, ftype)
+            write_tokens(fout, tokenizer)
+            process_and_write_variables(fout, model, ftype)
+
+        del model
+
+        print(f"Done. Output file: {fname_out}, (part {p})\n")
 
 if __name__ == "__main__":
     main()