
Commit 130bfc4

Add new evaluation metrics
1 parent 26e790d commit 130bfc4

File tree

1 file changed: +11 −1 lines changed


torchao/_models/llama/eval.py

Lines changed: 11 additions & 1 deletion
@@ -42,12 +42,20 @@ def run_evaluation(
     calibration_limit: Optional[int] = None,
     calibration_seq_length: Optional[int] = None,
     pad_calibration_inputs: Optional[bool] = False,
+    eval_difficulty: Optional[str] = "easy",
 ):
     """Runs the evaluation of a model using LM Eval."""
+
+    # Select eval tasks based on difficulty level
+    if eval_difficulty == "medium":
+        tasks.extend(['mmlu'])
+    elif eval_difficulty == "hard":
+        tasks.extend(['mmlu', 'truthfulqa_mc2', 'winogrande', 'arc_challenge', 'hellaswag', 'gsm8k'])
+
     print(
         f"\nEvaluating model {checkpoint_path} on tasks: {tasks}, limit: {limit}, device: {device}, precision: {precision}, "
         +f"quantization: {quantization}, compile: {compile}, max_length: {max_length}, calibration_tasks: {calibration_tasks}, "
-        +f"calibration_seq_length: {calibration_seq_length}, pad_calibration_inputs: {pad_calibration_inputs}\n"
+        +f"calibration_seq_length: {calibration_seq_length}, pad_calibration_inputs: {pad_calibration_inputs}, eval_difficulty: {eval_difficulty}\n"
     )
     torchao.quantization.utils.recommended_inductor_config_setter()

@@ -218,6 +226,7 @@ def run_evaluation(
     parser.add_argument('--calibration_limit', type=int, default=1000, help='number of samples to use for gptq calibration')
     parser.add_argument('--calibration_seq_length', type=int, default=100, help='length of sequences to use for gptq calibration')
     parser.add_argument('--pad_calibration_inputs', type=bool, default=False, help='pads sequences shorter than calibration_seq_length to that length, yielding more calibration inputs but running much slower')
+    parser.add_argument('-d', '--eval_difficulty', type=str, default="easy", help='difficulty of eval, one of [easy, medium, hard]')

     args = parser.parse_args()
     run_evaluation(
@@ -233,4 +242,5 @@ def run_evaluation(
         args.calibration_limit,
         args.calibration_seq_length,
         args.pad_calibration_inputs,
+        args.eval_difficulty,
     )
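
For reference, the task-selection logic added in the first hunk can be exercised on its own. The sketch below is illustrative only: the select_tasks helper and the ['wikitext'] base task list are assumptions for the example, not part of this commit (the script itself calls tasks.extend in place inside run_evaluation).

from typing import List

# Hypothetical helper mirroring the commit's difficulty handling.
def select_tasks(eval_difficulty: str, base_tasks: List[str]) -> List[str]:
    tasks = list(base_tasks)  # copy so the caller's list is untouched
    if eval_difficulty == "medium":
        tasks.extend(['mmlu'])
    elif eval_difficulty == "hard":
        tasks.extend(['mmlu', 'truthfulqa_mc2', 'winogrande',
                      'arc_challenge', 'hellaswag', 'gsm8k'])
    return tasks  # "easy" leaves the base task list unchanged

print(select_tasks("hard", ['wikitext']))

On the command line the new option is exposed as -d/--eval_difficulty, so an invocation such as python torchao/_models/llama/eval.py -d hard (other flags omitted) would run the hard task set.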
