Commit: separate gpt_evals + test for conv eval
Showing 7 changed files with 116 additions and 77 deletions.
@@ -0,0 +1,23 @@
from ultravox.evaluation import eval_types
from ultravox.evaluation import gpt_eval

BOOLQ_SYSTEM_PROMPT = f"""
You are an expert evaluator of AI systems.
Given a question with a known true/false answer, you will be rating the correctness of an AI model's answer to that same question.
Based on the supplied question, answer, and expected (correct) answer, you will rate the model's answer as either correct or incorrect.
Award 1 point if the model's answer matches the correct answer, and 0 points if the model's answer does not match, or cannot be converted to a true/false verdict.
Model answers of the form "True", "Yes", "Yeah", etc., should be considered to match a True answer.
Model answers of the form "False", "No", "Incorrect", etc., should be considered to match a False answer.
Only use the supplied correct answer to make your decision; DO NOT use your own knowledge to determine correctness.
Your response MUST start with either 0 or 1, followed by a space, and then a brief explanation for why you awarded that score.
"""
BOOLQ_USER_PROMPT = """
Using the supplied correct answer as ground truth, evaluate the model's answer to the question below:
Question: {{ question }}
Model answer: {{ generated_answer }}
Correct answer: {{ expected_answer }}
"""


def evaluate_answer_boolq(sample: eval_types.Sample) -> eval_types.InstructResult:
    return gpt_eval._evaluate_answer_gpt(BOOLQ_SYSTEM_PROMPT, BOOLQ_USER_PROMPT, sample)
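For orientation, a minimal usage sketch that is not part of the commit: it builds an eval_types.Sample the same way the test file below does and passes it to the new BoolQ evaluator. The module name gpt_eval_boolq and the concrete field values are assumptions for illustration.

from ultravox.evaluation import eval_types
from ultravox.evaluation import gpt_eval_boolq  # assumed module name for the file above

# Illustrative values; Sample's fields (question, generated_answer,
# expected_answer, history) match those used by the test in this commit.
sample = eval_types.Sample(
    question="Is the sky blue?",
    generated_answer="Yes",
    expected_answer="True",
    history=[],
)
result = gpt_eval_boolq.evaluate_answer_boolq(sample)  # returns an eval_types.InstructResult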
@@ -0,0 +1,29 @@
from ultravox.evaluation import eval_types
from ultravox.evaluation import gpt_eval

CONVO_SYSTEM_PROMPT = f"""
You are an expert evaluator of conversational AI systems.
Given a conversation between two parties, the role of the AI system was to follow the flow of the conversation and respond appropriately.
You are given the conversation, the AI model's response, and an exemplary (correct) response.
The AI model response might be truncated, but that should not affect your evaluation.
You should award 1 point if the model's response is appropriate and follows the conversation, and 0 points if it does not, such as being off-topic or nonsensical.
Your response MUST start with either 0 or 1, followed by a space, and then an explanation for why you awarded that score.
"""

CONVO_USER_PROMPT = """
Using the supplied example of a correct answer, evaluate the model's ability to follow the flow of the conversation in the last message:
Conversation:
{%- for turn in history + [ {"role": "user", "content": question} ] %}
{% if turn["role"] == "user" %}A{% else %}B{% endif %}: {{ turn["content"] }}
{% endfor %}
Model (as B): {{ generated_answer }}
Correct: {{ expected_answer }}
"""


def evaluate_conversation_response(
    sample: eval_types.Sample,
) -> eval_types.InstructResult:
    sample.history = [msg for msg in sample.history if msg["role"] != "system"]
    return gpt_eval._evaluate_answer_gpt(CONVO_SYSTEM_PROMPT, CONVO_USER_PROMPT, sample)
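The shared helper gpt_eval._evaluate_answer_gpt is not part of this diff, but the {{ ... }} / {% ... %} syntax above is Jinja2, and the test in this commit expects the conversation to expand into lines such as "A: T1" and "B: T2". The sketch below is an illustration under that assumption, not the project's own code; it shows how the user prompt renders for a concrete sample.

import jinja2

from ultravox.evaluation import eval_types
from ultravox.evaluation import gpt_eval_conv

sample = eval_types.Sample(
    history=[
        {"role": "user", "content": "T1"},
        {"role": "assistant", "content": "T2"},
    ],
    question="T3",
    generated_answer="T4",
    expected_answer="EXP",
)

# Render the user prompt with Jinja2; the for-loop labels user turns "A" and
# assistant turns "B", then appends the model answer and the exemplary answer.
rendered = jinja2.Template(gpt_eval_conv.CONVO_USER_PROMPT).render(
    question=sample.question,
    generated_answer=sample.generated_answer,
    expected_answer=sample.expected_answer,
    history=sample.history,
)
print(rendered)  # contains "A: T1", "B: T2", "A: T3", "Model (as B): T4", "Correct: EXP"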
@@ -0,0 +1,26 @@
from ultravox.evaluation import eval_types
from ultravox.evaluation import gpt_eval

INSTRUCT_SYSTEM_PROMPT = f"""
You are an expert evaluator of AI systems.
Given a question with a specified instruction, you will be rating the correctness of an AI model's ability to follow that instruction.
Based on the supplied answer, and exemplary (correct) answer, you will rate the model's answer as either correct or incorrect.
Award 1 point if the model followed the instruction, and 0 points if it did not.
For example, given a question with an instruction of "Write a sentence about pickleball",
- if the model responds "Pickleball is a tennis-like game played with a wiffle ball.", you should award 1 point.
- if the model responds "Pickleball is a type of fruit", you should award 0 points.
- if the model responds with something off-topic or nonsensical, you should award 0 points.
Your response MUST start with either 0 or 1, followed by a space, and then an explanation for why you awarded that score.
"""
INSTRUCT_USER_PROMPT = """
Using the supplied correct answer as an example, evaluate the model's ability to follow the instructions in the question below:
Question: {{ question }}
Model answer: {{ generated_answer }}
Correct answer: {{ expected_answer }}
"""


def evaluate_answer_instruct(sample: eval_types.Sample) -> eval_types.InstructResult:
    return gpt_eval._evaluate_answer_gpt(
        INSTRUCT_SYSTEM_PROMPT, INSTRUCT_USER_PROMPT, sample
    )
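All three system prompts require the judge's reply to start with "0" or "1", a space, and an explanation. How that reply is parsed happens inside gpt_eval._evaluate_answer_gpt, which is not shown in this diff; the helper below is a hypothetical, self-contained sketch of one way such a reply could be split into a score and a rationale.

def parse_judge_reply(reply: str) -> tuple[float, str]:
    # The prompts above mandate the format "<0 or 1> <explanation>", so the
    # first whitespace split yields the score and the rationale.
    score, _, rationale = reply.strip().partition(" ")
    return float(score), rationale

parse_judge_reply("1 The answer matches the expected True verdict.")
# -> (1.0, "The answer matches the expected True verdict.")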
@@ -0,0 +1,31 @@
import re
from unittest import mock

from ultravox.evaluation import eval_types
from ultravox.evaluation import gpt_eval
from ultravox.evaluation import gpt_eval_conv


def test_evaluate_conversation():
    gpt_eval.client = mock.MagicMock()
    sample = eval_types.Sample(
        history=[
            {"role": "system", "content": "Blah blah blah"},
            {"role": "user", "content": "T1"},
            {"role": "assistant", "content": "T2"},
        ],
        question="T3",
        generated_answer="T4",
        expected_answer="EXP",
    )
    expected_turns = "A: T1\n\nB: T2\n\nA: T3\n\nModel (as B): T4\nCorrect: EXP"

    gpt_eval_conv.evaluate_conversation_response(sample)

    completion_args = gpt_eval.client.chat.completions.create.call_args[1]
    assert len(completion_args["messages"]) == 2
    assert completion_args["messages"][0]["role"] == "system"
    assert completion_args["messages"][1]["role"] == "user"
    gpt_question = re.sub("\n *", "\n", completion_args["messages"][1]["content"])
    assert expected_turns in gpt_question
    assert "Blah blah blah" not in gpt_question