From e55750949bf0126e2e5702aec616d2c8cf496463 Mon Sep 17 00:00:00 2001
From: mzio
Date: Thu, 19 Sep 2024 18:19:34 -0700
Subject: [PATCH] Update lm eval model

---
 lm_eval_harness/eval_lm_harness_big.py | 30 +++++++++++++++++++++-----
 lm_eval_harness/models.py              | 19 ++++++++++++++++
 2 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/lm_eval_harness/eval_lm_harness_big.py b/lm_eval_harness/eval_lm_harness_big.py
index d759380..6f539e0 100644
--- a/lm_eval_harness/eval_lm_harness_big.py
+++ b/lm_eval_harness/eval_lm_harness_big.py
@@ -207,6 +207,20 @@ def count_params(module) -> int:
     return sum(p.numel() for p in module.parameters())
 
 
+def check_state_dict_keys(_keys, layer_idx, rank=0):
+    try:
+        assert len(_keys.unexpected_keys) == 0
+        if rank == 0:
+            print_header(f'*** All expected keys matched successfully {layer_idx} ***')
+    except Exception as e:
+        if rank == 0:
+            print(e)
+            print_header('*** Error: unexpected keys in checkpoint ***')
+            print(f'Unexpected keys at {layer_idx}:')
+            for k in _keys.unexpected_keys:
+                print(k)
+
+
 def main():
     sys.path.append(LM_EVALUATION_HARNESS_PATH)
     from lm_eval import evaluator
@@ -344,7 +358,8 @@ def main():
                                            peft_gradient_checkpointing=not args.no_peft_grad_ckpt,
                                            train_attention=False)
         if True:  # rank == 0:
-            if distill_config.trainer.name is not None or args.attn_mlp_checkpoint_path is not None:
+            # if distill_config.trainer.name is not None or args.attn_mlp_checkpoint_path is not None:
+            if distill_config.trainer.name is not None and args.attn_mlp_checkpoint_path is not None:
                 # if args.replicate == 64:
                 #     distill_config.model_name = distill_config.model_name.replace(f'-se={args.seed}', '-se=0').replace(f'-s={args.seed}', '-s=0')
                 # else:
@@ -366,10 +381,15 @@ def main():
                                            merge_loras=False,
                                            peft_gradient_checkpointing=not args.no_peft_grad_ckpt)
         if True:  # rank == 0:
-            model = load_sharded_model_single_gpu(model, model_path=args.finetune_checkpoint_path,  # None,
-                                                  cfg=finetune_config, rank=rank)
+            if '.pt' in args.finetune_checkpoint_path:
+                with torch.no_grad():
+                    _keys = model.load_state_dict(torch.load(args.finetune_checkpoint_path), strict=False)
+                    check_state_dict_keys(_keys, 0)
+            else:
+                model = load_sharded_model_single_gpu(model, model_path=args.finetune_checkpoint_path,  # None,
+                                                      cfg=finetune_config, rank=rank)
 
-    if rank == 0:
+    if True:  # if rank == 0:
         print_header('** Sanity check model weights **')
         for n, p in model.named_parameters():
             # if ('layers.0.' in n and ('feature_map' in n or 'lora' in n)):
@@ -421,4 +441,4 @@ def main():
 
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
diff --git a/lm_eval_harness/models.py b/lm_eval_harness/models.py
index 2b8fc68..f9a6d9c 100644
--- a/lm_eval_harness/models.py
+++ b/lm_eval_harness/models.py
@@ -9,6 +9,7 @@
 from src.model.modeling_mistral import LooooolcatsMistralForCausalLM as LOOOOOLCATS_MISTRAL_MODEL_CLASS
 
 from src.model.modeling_llama_sharded import ShardedLolcatsLlamaForCausalLM as SHARDED_LOLCATS_LLAMA_MODEL_CLASS
+from src.model.modeling_llama_sharded_roll import ShardedRollLolcatsLlamaForCausalLM as SHARDED_ROLL_LOLCATS_LLAMA_MODEL_CLASS
 
 
 class LolcatsLlamaForCausalLM(AutoCausalLM):
@@ -63,6 +64,24 @@ def add_special_tokens(self) -> bool:
             return self._add_special_tokens
         else:
             return False
+
+
+class ShardedRollLolcatsLlamaForCausalLM(AutoCausalLM):
+    """
+    Wrapper for Llama or Mistral-like autoregressive language model
+    """
+    AUTO_MODEL_CLASS = SHARDED_ROLL_LOLCATS_LLAMA_MODEL_CLASS
+    @property
+    def add_special_tokens(self) -> bool:
+        """Whether to include special tokens in encoded text. This should be
+        determined by whether or not the model was trained with special tokens.
+        TODO: Remove these conditionals once HuggingFace supports a way to
+        check whether or not an arbitrary model was trained with special tokens.
+        """
+        if self._add_special_tokens is not None:
+            return self._add_special_tokens
+        else:
+            return False
 
 
 class LooooolcatsLlamaForCausalLM(AutoCausalLM):
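
Note on the new loading branch: for a flat '.pt' checkpoint, the patch calls
model.load_state_dict(..., strict=False), which does not raise on mismatched
keys but instead returns a named tuple of missing_keys and unexpected_keys
that check_state_dict_keys() then inspects. A minimal, self-contained sketch
of that PyTorch pattern (TinyModel and the hand-built state dict below are
hypothetical stand-ins, not code from this patch):

    import torch
    import torch.nn as nn

    class TinyModel(nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = nn.Linear(4, 4)

        def forward(self, x):
            return self.linear(x)

    model = TinyModel()
    # Deliberately omit 'linear.bias' so the load reports a missing key.
    state_dict = {'linear.weight': torch.zeros(4, 4)}
    result = model.load_state_dict(state_dict, strict=False)

    # With strict=False the mismatches are returned instead of raised, so the
    # caller decides how to handle them; here we just print them, mirroring
    # the unexpected-key check in check_state_dict_keys().
    print('missing:', result.missing_keys)        # ['linear.bias']
    print('unexpected:', result.unexpected_keys)  # []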