import torch
+ from tabulate import tabulate

from transformers import AutoModelForCausalLM, AutoTokenizer

try:
    from lm_eval.evaluator import evaluate
    from lm_eval.models.huggingface import HFLM
    from lm_eval.tasks import get_task_dict
except ImportError:
print ("""
10
11
Error: The 'lm_eval' module was not found.
11
12
To install, follow these steps:
12
-
13
- 1. Clone the repository:
14
- git clone https://github.com/EleutherAI/lm-evaluation-harness
15
-
16
- 2. Change to the cloned directory:
17
- cd lm-evaluation-harness
18
-
19
- 3. Install the package in editable mode:
20
- pip install -e .
21
-
22
- After installation, re-run this script to use the LM Evaluation Harness.
13
+ pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git
23
14
""" )
24
15
raise # Re-raise the ImportError
25
16
33
24
torch ._inductor .config .force_fuse_int_mm_with_mul = True
34
25
torch ._inductor .config .fx_graph_cache = True
35
26
+ def pretty_print_nested_results(results, precision: int = 6):
+     def format_value(value):
+         if isinstance(value, float):
+             return f"{value:.{precision}f}"
+         return value
+
+     main_table = []
+     for task, metrics in results["results"].items():
+         subtable = [[k, format_value(v)] for k, v in metrics.items() if k != 'alias']
+         subtable.sort(key=lambda x: x[0])  # Sort metrics alphabetically
+         formatted_subtable = tabulate(subtable, tablefmt='grid')
+         main_table.append([task, formatted_subtable])
+
+     print(tabulate(main_table, headers=['Task', 'Metrics'], tablefmt='grid'))
+
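For illustration only, a minimal sketch of how the new helper could be exercised on a hand-built results dict; the task name and metric keys below are invented for this sketch, not taken from a real run:

    mock_result = {
        "results": {
            "wikitext": {
                "alias": "wikitext",               # dropped by the helper
                "word_perplexity,none": 9.876543,  # floats are shown with `precision` decimals
                "byte_perplexity,none": 1.54321,
            }
        }
    }
    pretty_print_nested_results(mock_result)
    # Prints an outer 'grid' table with one row per task; each Metrics cell
    # contains an inner grid of alphabetically sorted metric/value pairs.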
def run_evaluation(repo_id, tasks, limit, device, precision, quantization, compile, batch_size, max_length):

    tokenizer = AutoTokenizer.from_pretrained(repo_id)
@@ -50,7 +56,6 @@ def run_evaluation(repo_id, tasks, limit, device, precision, quantization, compi
        change_linear_weights_to_int4_woqtensors(model.to(device=device))
    elif quantization == "autoquant":
        model = autoquant(model.to(device=device))
-
    with torch.no_grad():
        result = evaluate(
            HFLM(
@@ -61,8 +66,8 @@ def run_evaluation(repo_id, tasks, limit, device, precision, quantization, compi
            get_task_dict(tasks),
            limit=limit,
        )
-     for task, res in result["results"].items():
-         print(f"{task}: {res}")
+
+     pretty_print_nested_results(result)


if __name__ == '__main__':
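The `__main__` block that follows in the full script is not part of this diff and presumably wires these parameters to command-line arguments. Purely as a hedged sketch, a direct call could look like the following, where every argument value is an assumption chosen for illustration rather than something this diff specifies:

    run_evaluation(
        repo_id="meta-llama/Llama-2-7b-hf",  # hypothetical Hugging Face model id
        tasks=["wikitext"],                  # hypothetical lm_eval task list
        limit=None,                          # evaluate the full task set
        device="cuda",
        precision=torch.bfloat16,            # assumed to be a torch dtype
        quantization="autoquant",            # one of the branches shown above
        compile=False,
        batch_size=1,
        max_length=None,
    )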