1
+ import os
2
+ import re
3
+ import argparse
4
+ from tqdm import tqdm
5
+
6
+ from transformers import pipeline , set_seed
7
+ from transformers import AutoTokenizer , AutoModelForCausalLM
8
+ from transformers .pipelines .base import Pipeline
9
+
10
+ from human_eval .data import write_jsonl , read_problems
11
+
12
def load_generation_pipe(model_name_or_path: str, gpu_device: int = 0):
    """Build a text-generation pipeline from a pretrained causal LM.

    Args:
        model_name_or_path: Hugging Face hub id or local checkpoint path.
        gpu_device: CUDA device index handed to the pipeline.

    Returns:
        A transformers text-generation ``Pipeline`` ready for sampling.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path)

    generation_pipe = pipeline(
        'text-generation',
        model=model,
        tokenizer=tokenizer,
        device=gpu_device,
    )

    print(
        "load generation pipeline from {} over, vocab size = {}, eos id = {}, gpu device = {}.".format(
            model_name_or_path, len(tokenizer), tokenizer.eos_token_id, gpu_device
        )
    )

    return generation_pipe
28
+
29
def extract_function_block(string):
    """Truncate a generated completion at the start of the next top-level statement.

    The model often keeps generating past the end of the target function; cut
    the text at the first newline that begins a new column-0 construct
    (``class``/``def``/``#``/``@``/``print``/``if``) and strip trailing
    whitespace from what remains.
    """
    # BUG FIX: the previous pattern ("\n class|\n def|...") required a space
    # after each newline, so it only matched *indented* statements and never
    # truncated completions followed by column-0 code. Match the keywords
    # immediately after the newline, using a raw string for the regex.
    return re.split(r"\nclass|\ndef|\n#|\n@|\nprint|\nif", string)[0].rstrip()
31
+
32
def run_code_generation(pipe, prompt, num_completions=1, **gen_kwargs):
    """Sample ``num_completions`` completions for ``prompt`` and trim each one.

    Seeds the RNG to a fixed value so repeated runs are reproducible, then
    returns the generated continuations (prompt removed) truncated to a
    single function block.
    """
    set_seed(123)  # fixed seed: identical runs yield identical samples

    outputs = pipe(
        prompt,
        num_return_sequences=num_completions,
        **gen_kwargs
    )

    completions = []
    for output in outputs:
        continuation = output["generated_text"][len(prompt):]
        completions.append(extract_function_block(continuation))
    return completions
41
+
42
def evaluate_on_human_eval(
    model_name_or_path: str,
    temperature: float,
    top_p: float,
    num_samples_per_task: int,
    max_new_tokens: int,
    gpu_device: int,
    output_dir: str,
) -> str:
    """Generate completions for every HumanEval problem and write them to jsonl.

    Args:
        model_name_or_path: HF hub id or local checkpoint directory.
        temperature: sampling temperature.
        top_p: nucleus-sampling threshold.
        num_samples_per_task: completions to draw per problem.
        max_new_tokens: generation length cap per completion.
        gpu_device: CUDA device index for the pipeline.
        output_dir: destination directory; if None, falls back to the model
            directory for local checkpoints.

    Returns:
        Path of the written ``.samples.jsonl`` file.

    Raises:
        ValueError: if ``output_dir`` is None and the model is not a local path.
    """
    pipe: Pipeline = load_generation_pipe(model_name_or_path, gpu_device=gpu_device)
    eval_name = f"human_eval.t{temperature}.p{top_p}.l{max_new_tokens}.n{num_samples_per_task}"

    if output_dir is None:
        if os.path.exists(model_name_or_path):
            # Local checkpoint: store samples next to the model weights.
            output_dir = model_name_or_path
        else:
            # BUG FIX: grammar in the original message ("evaluation a local model").
            raise ValueError("Output dir can't be null if you are not evaluating a local model.")

    os.makedirs(output_dir, exist_ok=True)
    saved_path = os.path.join(output_dir, f"{eval_name}.samples.jsonl")

    gen_kwargs = {
        "do_sample": True,
        "temperature": temperature,
        "max_new_tokens": max_new_tokens,
        "top_p": top_p,
        "top_k": 0,  # disable top-k so only nucleus sampling filters tokens
        # Some tokenizers (e.g. GPT-2 family) define no pad token; reuse EOS.
        "pad_token_id": pipe.tokenizer.pad_token_id if pipe.tokenizer.pad_token_id else pipe.tokenizer.eos_token_id,
        "eos_token_id": pipe.tokenizer.eos_token_id,
    }

    problems = read_problems()
    samples = []
    generate_batch_size = min(50, num_samples_per_task)

    # Some tokenizers define no BOS token; fall back to EOS as the prompt prefix.
    bos_token = pipe.tokenizer.bos_token if pipe.tokenizer.bos_token else pipe.tokenizer.eos_token

    for task_id in tqdm(problems):
        # Strip is important: newer tokenizers will not treat '\n' as an independent token.
        prompt = problems[task_id]["prompt"].strip()
        input_prompt = bos_token + prompt

        # BUG FIX: `range(num_samples_per_task // generate_batch_size)` silently
        # dropped the remainder (e.g. 75 samples with batch 50 produced only 50).
        # Generate exactly num_samples_per_task completions per task.
        remaining = num_samples_per_task
        while remaining > 0:
            batch_size = min(generate_batch_size, remaining)
            gen_results = run_code_generation(pipe, input_prompt, num_completions=batch_size, **gen_kwargs)
            for gen_result in gen_results:
                samples.append(dict(task_id=task_id, completion=gen_result))
            remaining -= batch_size

    write_jsonl(saved_path, samples)
    print("Run generation over, save {} samples to {}.".format(len(samples), saved_path))

    # BUG FIX: the signature promises `-> str` but the original returned None.
    return saved_path
92
+
93
if __name__ == '__main__':
    # CLI entry point: parse sampling/evaluation options and start generation.
    arg_parser = argparse.ArgumentParser(description='Run evaluation for code generation model on human-eval.')

    arg_parser.add_argument('-model', '--model_name_or_path', type=str, required=True)
    arg_parser.add_argument('-o', '--output_dir', type=str, default=None)
    arg_parser.add_argument('-n', '--num_completions', type=int, default=100)
    arg_parser.add_argument('-t', '--temperature', type=float, default=0.2)
    arg_parser.add_argument('-p', '--top_p', type=float, default=0.95)
    arg_parser.add_argument('-l', '--max_new_tokens', type=int, default=100)
    arg_parser.add_argument('-gpu', "--gpu_device", type=int, default=0)

    cli_args = arg_parser.parse_args()

    evaluate_on_human_eval(
        model_name_or_path=cli_args.model_name_or_path,
        temperature=cli_args.temperature,
        top_p=cli_args.top_p,
        num_samples_per_task=cli_args.num_completions,
        max_new_tokens=cli_args.max_new_tokens,
        gpu_device=cli_args.gpu_device,
        output_dir=cli_args.output_dir,
    )
0 commit comments