Commit 5a1803e

Add new evaluation metrics
1 parent 26e790d · commit 5a1803e


torchao/_models/llama/eval.py

Lines changed: 9 additions & 0 deletions
@@ -42,6 +42,7 @@ def run_evaluation(
     calibration_limit: Optional[int] = None,
     calibration_seq_length: Optional[int] = None,
     pad_calibration_inputs: Optional[bool] = False,
+    eval_difficulty: Optional[str] = "easy",
 ):
     """Runs the evaluation of a model using LM Eval."""
     print(
@@ -179,6 +180,12 @@ def run_evaluation(
     model.to(device)
     model.reset_caches()

+    # Select eval tasks based on difficulty level
+    if eval_difficulty == "medium":
+        tasks.extend(['mmlu'])
+    elif eval_difficulty == "hard":
+        tasks.extend(['mmlu', 'truthfulqa_mc2', 'winogrande', 'arc_challenge', 'hellaswag', 'gsm8k'])
+
     if compile:
         model = torch.compile(model, mode="max-autotune", fullgraph=True)
     with torch.no_grad():
@@ -218,6 +225,7 @@ def run_evaluation(
     parser.add_argument('--calibration_limit', type=int, default=1000, help='number of samples to use for gptq calibration')
     parser.add_argument('--calibration_seq_length', type=int, default=100, help='length of sequences to use for gptq calibration')
     parser.add_argument('--pad_calibration_inputs', type=bool, default=False, help='pads sequences shorter than calibration_seq_length to that length, yielding more calibration inputs but running much slower')
+    parser.add_argument('--eval_difficulty', type=str, default="easy", help='difficulty of eval, one of [easy, medium, hard]')

     args = parser.parse_args()
     run_evaluation(
@@ -233,4 +241,5 @@ def run_evaluation(
         args.calibration_limit,
         args.calibration_seq_length,
         args.pad_calibration_inputs,
+        args.eval_difficulty,
     )
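For reference, a usage sketch (not part of the commit; any other flags the script requires, such as a checkpoint path, are omitted here): the new option is an ordinary argparse flag, so the hardest tier would be selected with

    python torchao/_models/llama/eval.py --eval_difficulty hard

With the default of "easy" the task list is left unchanged, so existing invocations keep their current behavior; "medium" appends mmlu, and "hard" appends mmlu plus five harder benchmarks (truthfulqa_mc2, winogrande, arc_challenge, hellaswag, gsm8k).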
