
Commit ae43243

make conversion user dependent on model type, defaults for original paper models

1 parent a77d15b · commit ae43243

File tree: 1 file changed (+98, −68 lines)


src/transformers/models/mamba2/convert_mamba2_ssm_checkpoint_to_pytorch.py

Lines changed: 98 additions & 68 deletions
@@ -16,8 +16,9 @@
 
 import argparse
 import json
+from functools import partial
 from os import path
-from typing import Dict, Optional, Tuple
+from typing import Dict, Optional
 
 import torch
 from safetensors import safe_open
@@ -26,46 +27,26 @@
 from transformers import LlamaTokenizerFast, Mamba2Config, Mamba2ForCausalLM, Mamba2TokenizerFast
 
 
-_MAMBA2_MODELS_DICT = {
-    "codestral": {
-        "hidden_size": "dim",
-        "num_hidden_layers": "n_layers",
-        "n_groups": "n_groups",
-        "residual_in_fp32": "residual_in_fp32",
-        "tie_word_embeddings": "tie_embeddings",
-        "norm_before_gate": False,
-        "vocab_size": "vocab_size",
-        "pad_vocab_size_multiple": "pad_vocab_size_multiple",
-        "bos_token_id": 0,
-        "pad_token_id": 1,
-        "eos_token_id": 2,
-    },
-    "base": {
-        "hidden_size": "d_model",
-        "num_hidden_layers": "n_layer",
-        "n_groups": "ngroups",
-        "residual_in_fp32": "residual_in_fp32",
-        "tie_word_embeddings": "tie_embeddings",
-        "norm_before_gate": False,
-        "vocab_size": "vocab_size",
-        "pad_vocab_size_multiple": "pad_vocab_size_multiple",
-        "bos_token_id": 0,
-        "pad_token_id": 0,
-        "eos_token_id": 0,
-    },
-}
+def load_state_dict_from_safetensors(mamba2_checkpoint_path: str, ckpt_name: str) -> Dict[str, torch.Tensor]:
+    # Load weights and config from paths
+    original_state_dict = {}
+    with safe_open(path.join(mamba2_checkpoint_path, ckpt_name), framework="pt") as f:
+        for k in f.keys():
+            newk = k.removeprefix("model.")
+            original_state_dict[newk] = f.get_tensor(k).clone()
+    return original_state_dict
 
 
-def convert_ssm_config_to_hf_config(config_ssm: Dict) -> Tuple[Mamba2Config, bool]:
+def load_state_dict_from_torch(mamba2_checkpoint_path: str, ckpt_name: str) -> Dict[str, torch.Tensor]:
+    return torch.load(path.join(mamba2_checkpoint_path, ckpt_name), map_location="cpu")
+
+
+def convert_ssm_config_to_hf_config(config_ssm: Dict, mamba2_model_dict: Dict) -> Mamba2Config:
     """Convert a Mamba2Config from mamba_ssm to a Mamba2Config from here."""
     hf_config = Mamba2Config()
 
-    # Flag for codestral model
-    is_not_codestral = "dim" not in config_ssm
-
     # Switch to a different dict depending on model type
-    config_key = "base" if is_not_codestral else "codestral"
-    config_dict = _MAMBA2_MODELS_DICT[config_key]
+    config_dict = mamba2_model_dict
 
     # Set important values from config and recalculate other resulting entries
     hf_config.hidden_size = config_ssm[config_dict["hidden_size"]]
@@ -86,43 +67,82 @@ def convert_ssm_config_to_hf_config(config_ssm: Dict) -> Tuple[Mamba2Config, boo
         vocab_size += pad_vocab_size_multiple - (vocab_size % pad_vocab_size_multiple)
     hf_config.vocab_size = vocab_size
 
-    return hf_config, is_not_codestral
+    return hf_config
 
 
-def load_state_dict_from_safetensors(mamba2_checkpoint_path: str) -> Dict[str, torch.Tensor]:
-    # Load weights and config from paths
-    original_state_dict = {}
-    with safe_open(path.join(mamba2_checkpoint_path, "consolidated.safetensors"), framework="pt") as f:
-        for k in f.keys():
-            newk = k.removeprefix("model.")
-            original_state_dict[newk] = f.get_tensor(k).clone()
-    return original_state_dict
+def load_and_save_tokenizer(
+    mamba2_model_type: str,
+    output_dir: str,
+    tokenizer_model_path: Optional[str] = None,
+) -> None:
+    tokenizer = None
 
+    # Load tokenizer
+    if tokenizer_model_path is not None and mamba2_model_type == "codestral":
+        tokenizer_class = LlamaTokenizerFast
+        tokenizer = tokenizer_class(tokenizer_model_path, legacy=False, from_slow=True)
+    elif mamba2_model_type == "mamba_ssm":
+        tokenizer = Mamba2TokenizerFast.from_pretrained("state-spaces/mamba-130m-hf")
 
-def load_state_dict_from_torch(mamba2_checkpoint_path: str) -> Dict[str, torch.Tensor]:
-    return torch.load(path.join(mamba2_checkpoint_path, "pytorch_model.bin"), map_location="cpu")
+    # Save tokenizer
+    if tokenizer is not None:
+        tokenizer.save_pretrained(output_dir)
+
+
+_MAMBA2_MODELS_DICT = {
+    "codestral": {
+        "hidden_size": "dim",
+        "num_hidden_layers": "n_layers",
+        "n_groups": "n_groups",
+        "residual_in_fp32": "residual_in_fp32",
+        "tie_word_embeddings": "tie_embeddings",
+        "norm_before_gate": False,
+        "vocab_size": "vocab_size",
+        "pad_vocab_size_multiple": "pad_vocab_size_multiple",
+        "bos_token_id": 0,
+        "pad_token_id": 1,
+        "eos_token_id": 2,
+        "config_name": "params.json",
+        "load_state_dict": partial(load_state_dict_from_safetensors, ckpt_name="consolidated.safetensors"),
+        "load_and_save_tokenizer": partial(load_and_save_tokenizer, "codestral"),
+    },
+    "mamba_ssm": {
+        "hidden_size": "d_model",
+        "num_hidden_layers": "n_layer",
+        "n_groups": "ngroups",
+        "residual_in_fp32": "residual_in_fp32",
+        "tie_word_embeddings": "tie_embeddings",
+        "norm_before_gate": False,
+        "vocab_size": "vocab_size",
+        "pad_vocab_size_multiple": "pad_vocab_size_multiple",
+        "bos_token_id": 0,
+        "pad_token_id": 0,
+        "eos_token_id": 0,
+        "config_name": "config.json",
+        "load_state_dict": partial(load_state_dict_from_torch, ckpt_name="pytorch_model.bin"),
+        "load_and_save_tokenizer": partial(load_and_save_tokenizer, "mamba_ssm"),
+    },
+}
 
 
 def convert_mamba2_checkpoint_file_to_huggingface_model_file(
-    mamba2_checkpoint_path: str, precision: str, output_dir: str, tokenizer_model_path: Optional[str] = None
+    mamba2_checkpoint_path: str,
+    mamba2_model_type: str,
+    precision: str,
+    output_dir: str,
+    tokenizer_model_path: Optional[str] = None,
 ) -> None:
+    mamba2_model_dict = _MAMBA2_MODELS_DICT[mamba2_model_type]
+
     # Load and save config based on name
-    config_path = mamba2_checkpoint_path
-    config_path = (
-        path.join(config_path, "params.json")
-        if path.isfile(path.join(config_path, "params.json"))
-        else path.join(config_path, "config.json")
-    )
+    config_path = path.join(mamba2_checkpoint_path, mamba2_model_dict["config_name"])
     with open(config_path, "r", encoding="utf-8") as json_file:
         config = json.load(json_file)
-    hf_config, is_not_codestral = convert_ssm_config_to_hf_config(config)
+    hf_config = convert_ssm_config_to_hf_config(config_ssm=config, mamba2_model_dict=mamba2_model_dict)
    hf_config.save_pretrained(output_dir)
 
-    # Load state dict of the original model
-    state_dict_load_function = load_state_dict_from_torch if is_not_codestral else load_state_dict_from_safetensors
-    original_state_dict = state_dict_load_function(mamba2_checkpoint_path)
-
-    # Load and transfer to hf model
+    # Load state dict of the original model and transfer to hf model
+    original_state_dict = mamba2_model_dict["load_state_dict"](mamba2_checkpoint_path=mamba2_checkpoint_path)
     hf_model = Mamba2ForCausalLM(hf_config)
     hf_model.load_state_dict(original_state_dict)
 
@@ -131,12 +151,7 @@ def convert_mamba2_checkpoint_file_to_huggingface_model_file(
     save_model(hf_model.to(dtype), path.join(output_dir, "model.safetensors"), metadata={"format": "pt"})
 
     # Load and save tokenizer
-    if tokenizer_model_path is not None and not is_not_codestral:
-        tokenizer_class = LlamaTokenizerFast
-        tokenizer = tokenizer_class(tokenizer_model_path, legacy=False, from_slow=True)
-    else:
-        tokenizer = Mamba2TokenizerFast.from_pretrained("state-spaces/mamba-130m-hf")
-    tokenizer.save_pretrained(output_dir)
+    mamba2_model_dict["load_and_save_tokenizer"](output_dir=output_dir, tokenizer_model_path=tokenizer_model_path)
 
 
 if __name__ == "__main__":
@@ -148,13 +163,24 @@ def convert_mamba2_checkpoint_file_to_huggingface_model_file(
         required=True,
         help="Path to a directory containing the `pytorch_model.bin` or `.safetensors` mamba2_ssm checkpoint file to be converted.",
     )
+    parser.add_argument(
+        "-m",
+        "--mamba2_model_type",
+        type=str,
+        default="mamba_ssm",
+        const="mamba_ssm",
+        required=True,
+        choices=("codestral", "mamba_ssm"),
+        help="The model type the conversion will be performed on. Can choose from either `codestral` or `mamba_ssm`.",
+    )
     parser.add_argument(
         "-p",
         "--precision",
         type=str,
-        default="bf16",
+        default="fp16",
+        const="fp16",
         required=True,
-        choices=["fp32", "fp16", "bf16"],
+        choices=("fp32", "fp16", "bf16"),
         help="The precision the model will be saved in. Select from fp32, fp16 or bf16.",
     )
     parser.add_argument(
@@ -171,5 +197,9 @@ def convert_mamba2_checkpoint_file_to_huggingface_model_file(
     args = parser.parse_args()
 
     convert_mamba2_checkpoint_file_to_huggingface_model_file(
-        args.mamba2_checkpoint_file, args.precision, args.output_dir, args.tokenizer_model_path
+        args.mamba2_checkpoint_directory,
+        args.mamba2_model_type,
+        args.precision,
+        args.output_dir,
+        args.tokenizer_model_path,
     )
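For reference, a minimal usage sketch of the updated entry point after this commit. The paths below are placeholders and the import path is assumed from the file location in this commit, not stated in it. The `mamba2_model_type` argument selects the per-type defaults in `_MAMBA2_MODELS_DICT`: `"mamba_ssm"` for the original paper checkpoints (`config.json` plus `pytorch_model.bin`) and `"codestral"` for `params.json` plus `consolidated.safetensors`.

# Sketch only: placeholder paths; module path assumed from the script's location in the source tree.
from transformers.models.mamba2.convert_mamba2_ssm_checkpoint_to_pytorch import (
    convert_mamba2_checkpoint_file_to_huggingface_model_file,
)

convert_mamba2_checkpoint_file_to_huggingface_model_file(
    mamba2_checkpoint_path="/path/to/original/checkpoint_dir",
    mamba2_model_type="mamba_ssm",  # or "codestral"
    precision="fp16",
    output_dir="/path/to/hf_output_dir",
    tokenizer_model_path=None,  # only consumed by the "codestral" tokenizer branch
)

Compared to the previous revision, the caller now names the model type explicitly instead of the script inferring it via `is_not_codestral = "dim" not in config_ssm`; the per-type dictionary then supplies the config filename, the `partial`-bound state-dict loader, and the tokenizer handler.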
