 import typer

 from autora.doc.classes.EvalResult import EvalResult
-from autora.doc.pipelines.metrics import eval_bleu_meteor
+from autora.doc.pipelines.metrics import eval_bleu_meteor, eval_semscore
 from autora.doc.runtime.predict_hf import Predictor
 from autora.doc.runtime.prompts import PROMPTS, PromptIds
 from autora.doc.util import get_prompts_from_file
@@ -52,14 +52,8 @@ def eval_prompts(
     predictor = Predictor(model_path)
     for i in range(len(prompts_list)):
         logger.info(f"Starting to run model on prompt {i}")
-        prediction_with_scores = eval_prompt(data_file, predictor, prompts_list[i], param_dict)
+        eval_result = eval_prompt(data_file, predictor, prompts_list[i], param_dict)
         logger.info(f"Model run completed on prompt {i}: {prompts_list[i]}")
-        eval_result = EvalResult(
-            prediction_with_scores[0],
-            prompts_list[i],
-            prediction_with_scores[1],
-            prediction_with_scores[2],
-        )
         results_list.append(eval_result)
     return results_list
 
@@ -72,7 +66,7 @@ def eval(
     param: List[str] = typer.Option(
         [], help="Additional float parameters to pass to the model as name=float pairs"
     ),
-) -> Tuple[List[str], float, float]:
+) -> EvalResult:
     import mlflow
 
     mlflow.autolog()
@@ -104,9 +98,7 @@ def load_data(data_file: str) -> Tuple[List[str], List[str]]:
     return inputs, labels
 
 
-def eval_prompt(
-    data_file: str, pred: Predictor, prompt: str, param_dict: Dict[str, float]
-) -> Tuple[List[str], float, float]:
+def eval_prompt(data_file: str, pred: Predictor, prompt: str, param_dict: Dict[str, float]) -> EvalResult:
     import mlflow
 
     inputs, labels = load_data(data_file)
@@ -115,6 +107,7 @@ def eval_prompt(
     predictions = pred.predict(prompt, inputs, **param_dict)
     timer_end = timer()
     bleu, meteor = eval_bleu_meteor(predictions, labels)
+    semscore = eval_semscore(predictions, labels)
     pred_time = timer_end - timer_start
     mlflow.log_metric("prediction_time/doc", pred_time / (len(inputs)))
     for i in range(len(inputs)):
@@ -133,7 +126,8 @@ def eval_prompt(
     mlflow.log_metric("tokens/sec", total_tokens / pred_time)
     mlflow.log_metric("bleu_score", round(bleu, 5))
     mlflow.log_metric("meteor_score", round(meteor, 5))
-    return predictions, bleu, meteor
+    mlflow.log_metric("semscore", round(semscore, 5))
+    return EvalResult(predictions, prompt, bleu, meteor, semscore)
 
 
 @app.command()
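The diff imports EvalResult and eval_semscore but does not show their definitions. The following is a minimal sketch of the interfaces the call sites above rely on; the field names, defaults, embedding model, and averaging logic are assumptions for illustration, not the repository's actual code.

# Illustrative sketch only -- inferred from EvalResult(predictions, prompt, bleu, meteor, semscore)
# and eval_semscore(predictions, labels) as used in the diff above.
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class EvalResult:
    # Field order matches the positional construction in eval_prompt()
    predictions: List[str]
    prompt: str
    bleu_score: Optional[float] = None
    meteor_score: Optional[float] = None
    sem_score: Optional[float] = None


def eval_semscore(predictions: List[str], labels: List[str]) -> float:
    # Hypothetical implementation: mean cosine similarity between prediction
    # and label sentence embeddings (the model choice is an assumption).
    from sentence_transformers import SentenceTransformer, util

    model = SentenceTransformer("all-MiniLM-L6-v2")
    scores = [
        util.cos_sim(model.encode(p), model.encode(l)).item()
        for p, l in zip(predictions, labels)
    ]
    return sum(scores) / len(scores) if scores else 0.0

With a container like this, eval_prompts() can append the EvalResult returned by eval_prompt() directly instead of unpacking a (predictions, bleu, meteor) tuple and rebuilding the object, which is what the hunk at @@ -52,14 +52,8 @@ simplifies.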