separate gpt_evals + test for conv eval
farzadab committed Jun 25, 2024
1 parent 87aaf19 commit 5ba015c
Showing 7 changed files with 116 additions and 77 deletions.
10 changes: 6 additions & 4 deletions ultravox/evaluation/eval.py

@@ -1,5 +1,7 @@
 from ultravox.evaluation import eval_types
-from ultravox.evaluation import gpt_eval
+from ultravox.evaluation import gpt_eval_boolq
+from ultravox.evaluation import gpt_eval_conv
+from ultravox.evaluation import gpt_eval_instruct
 from ultravox.evaluation import string_based
 from ultravox.evaluation import wer
 
@@ -8,11 +10,11 @@ def evaluate_answer(sample: eval_types.Sample, metric: str) -> eval_types.Result
     if metric == "asr":
         return wer.evaluate_answer_asr(sample)
     elif metric == "boolq":
-        return gpt_eval.evaluate_answer_boolq(sample)
+        return gpt_eval_boolq.evaluate_answer_boolq(sample)
     elif metric == "instruct":
-        return gpt_eval.evaluate_answer_instruct(sample)
+        return gpt_eval_instruct.evaluate_answer_instruct(sample)
     elif metric == "conversation":
-        return gpt_eval.evaluate_conversation_response(sample)
+        return gpt_eval_conv.evaluate_conversation_response(sample)
     elif metric == "exact_match_last_word":
         return string_based.match_last_word(sample)
     else:
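
For context, a minimal caller-side sketch of the dispatch above (a hypothetical usage example, not part of this commit; the Sample fields mirror the construction in gpt_eval_test.py below, and result.score / result.reason come from the InstructResult visible in gpt_eval.py):

    from ultravox.evaluation import eval, eval_types

    sample = eval_types.Sample(
        history=[
            {"role": "user", "content": "How's the weather today?"},
            {"role": "assistant", "content": "Sunny and warm."},
        ],
        question="Should I bring an umbrella?",
        generated_answer="No, you shouldn't need one.",
        expected_answer="Probably not, it looks clear.",
    )
    # The GPT-backed metrics ("boolq", "instruct", "conversation") also require
    # gpt_eval.client to be configured; see the note under gpt_eval.py below.
    result = eval.evaluate_answer(sample, metric="conversation")
    print(result.score, result.reason)
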
2 changes: 1 addition & 1 deletion ultravox/evaluation/eval_types.py

@@ -1,5 +1,5 @@
 import dataclasses
-from typing import List, Optional, Union, Dict
+from typing import Dict, List, Optional, Union
 
 import dataclasses_json
 
72 changes: 0 additions & 72 deletions ultravox/evaluation/gpt_eval.py

@@ -6,63 +6,6 @@
 
 from ultravox.evaluation import eval_types
 
-INSTRUCT_SYSTEM_PROMPT = f"""
-You are an expert evaluator of AI systems.
-Given a question with a specified instruction, you will be rating the correctness of an AI model's ability to follow that instruction.
-Based on the supplied answer, and exemplary (correct) answer, you will rate the model's answer as either correct or incorrect.
-Award 1 point if the model followed the instruction, and 0 points if it did not.
-For example, given a question with an instruction of "Write a sentence about pickleball",
-- if the model responds "Pickleball is a tennis-like game played with a wiffle ball.", you should award 1 point.
-- if the model responds "Pickleball is a type of fruit", you should award 0 points.
-- if the model responds with something off-topic or nonsensical, you should award 0 points.
-Your response MUST start with either 0 or 1, followed by a space, and then an explanation for why you awarded that score.
-"""
-INSTRUCT_USER_PROMPT = """
-Using the supplied correct answer as an example, evaluate the model's ability to follow the instructions in the question below:
-Question: {{ question }}
-Model answer: {{ generated_answer }}
-Correct answer: {{ expected_answer }}
-"""
-
-
-BOOLQ_SYSTEM_PROMPT = f"""
-You are an expert evaluator of AI systems.
-Given a question with a known true/false answer, you will be rating the correctness of an AI model's answer to that same question.
-Based on the supplied question, answer, and expected (correct) answer, you will rate the model's answer as either correct or incorrect.
-Award 1 point if the model's answer matches the correct answer, and 0 points if the model's answer does not match, or cannot be converted to a true/false verdict.
-Model answers of the form "True", "Yes", "Yeah", etc., should be considered to match a True answer.
-Model answers of the form "False", "No", "Incorrect", etc., should be considered to match a False answer.
-Only use the supplied correct answer to make your decision; DO NOT use your own knowledge to determine correctness.
-Your response MUST start with either 0 or 1, followed by a space, and then a brief explanation for why you awarded that score.
-"""
-BOOLQ_USER_PROMPT = """
-Using the supplied correct answer as ground truth, evaluate the model's answer to the question below:
-Question: {{ question }}
-Model answer: {{ generated_answer }}
-Correct answer: {{ expected_answer }}
-"""
-
-
-CONVO_SYSTEM_PROMPT = f"""
-You are an expert evaluator of conversational AI systems.
-Given a conversation between two parties, the role of the AI system was to follow the flow of the conversation and respond appropriately.
-You are given the conversation, the AI model's response, and an exemplary (correct) response.
-The AI model response might be truncated, but that should not affect your evaluation.
-Your should award 1 point if the model's response is appropriate and follows the conversation, and 0 points if it does not, such as being off-topic or nonsensical.
-Your response MUST start with either 0 or 1, followed by a space, and then an explanation for why you awarded that score.
-"""
-
-CONVO_USER_PROMPT = """
-Using the supplied example of a correct answer, evaluate the model's ability to follow the flow of the conversation in the last message:
-Conversation:
-{%- for turn in history + [ {"role": "user", "content": question} ] %}
-{% if turn["role"] == "user" %}A{% else %}B{% endif %}: {{ turn["content"] }}
-{% endfor %}
-Model (as B): {{ generated_answer }}
-Correct: {{ expected_answer }}
-"""
-
 RATING_MODEL = "gpt-4o"
 client: Optional[openai.Client] = None
 
@@ -97,18 +40,3 @@ def _evaluate_answer_gpt(
         pass
 
     return eval_types.InstructResult(score=score, reason=rating_text[2:])
-
-
-def evaluate_answer_boolq(sample: eval_types.Sample) -> eval_types.InstructResult:
-    return _evaluate_answer_gpt(BOOLQ_SYSTEM_PROMPT, BOOLQ_USER_PROMPT, sample)
-
-
-def evaluate_answer_instruct(sample: eval_types.Sample) -> eval_types.InstructResult:
-    return _evaluate_answer_gpt(INSTRUCT_SYSTEM_PROMPT, INSTRUCT_USER_PROMPT, sample)
-
-
-def evaluate_conversation_response(
-    sample: eval_types.Sample,
-) -> eval_types.InstructResult:
-    sample.history = [msg for msg in sample.history if msg["role"] != "system"]
-    return _evaluate_answer_gpt(CONVO_SYSTEM_PROMPT, CONVO_USER_PROMPT, sample)
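
The shared GPT rating helper, RATING_MODEL, and the module-level client stay behind in gpt_eval.py. Since client defaults to None, it presumably has to be set before any of the evaluators run; a minimal setup sketch assuming the standard OpenAI SDK (how the project actually initializes the client is not shown in this diff):

    import openai

    from ultravox.evaluation import gpt_eval

    # Hypothetical setup; openai.Client() reads OPENAI_API_KEY from the environment.
    gpt_eval.client = openai.Client()
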
23 changes: 23 additions & 0 deletions ultravox/evaluation/gpt_eval_boolq.py

@@ -0,0 +1,23 @@
+from ultravox.evaluation import eval_types
+from ultravox.evaluation import gpt_eval
+
+BOOLQ_SYSTEM_PROMPT = f"""
+You are an expert evaluator of AI systems.
+Given a question with a known true/false answer, you will be rating the correctness of an AI model's answer to that same question.
+Based on the supplied question, answer, and expected (correct) answer, you will rate the model's answer as either correct or incorrect.
+Award 1 point if the model's answer matches the correct answer, and 0 points if the model's answer does not match, or cannot be converted to a true/false verdict.
+Model answers of the form "True", "Yes", "Yeah", etc., should be considered to match a True answer.
+Model answers of the form "False", "No", "Incorrect", etc., should be considered to match a False answer.
+Only use the supplied correct answer to make your decision; DO NOT use your own knowledge to determine correctness.
+Your response MUST start with either 0 or 1, followed by a space, and then a brief explanation for why you awarded that score.
+"""
+BOOLQ_USER_PROMPT = """
+Using the supplied correct answer as ground truth, evaluate the model's answer to the question below:
+Question: {{ question }}
+Model answer: {{ generated_answer }}
+Correct answer: {{ expected_answer }}
+"""
+
+
+def evaluate_answer_boolq(sample: eval_types.Sample) -> eval_types.InstructResult:
+    return gpt_eval._evaluate_answer_gpt(BOOLQ_SYSTEM_PROMPT, BOOLQ_USER_PROMPT, sample)
29 changes: 29 additions & 0 deletions ultravox/evaluation/gpt_eval_conv.py

@@ -0,0 +1,29 @@
+from ultravox.evaluation import eval_types
+from ultravox.evaluation import gpt_eval
+
+CONVO_SYSTEM_PROMPT = f"""
+You are an expert evaluator of conversational AI systems.
+Given a conversation between two parties, the role of the AI system was to follow the flow of the conversation and respond appropriately.
+You are given the conversation, the AI model's response, and an exemplary (correct) response.
+The AI model response might be truncated, but that should not affect your evaluation.
+Your should award 1 point if the model's response is appropriate and follows the conversation, and 0 points if it does not, such as being off-topic or nonsensical.
+Your response MUST start with either 0 or 1, followed by a space, and then an explanation for why you awarded that score.
+"""
+
+CONVO_USER_PROMPT = """
+Using the supplied example of a correct answer, evaluate the model's ability to follow the flow of the conversation in the last message:
+Conversation:
+{%- for turn in history + [ {"role": "user", "content": question} ] %}
+{% if turn["role"] == "user" %}A{% else %}B{% endif %}: {{ turn["content"] }}
+{% endfor %}
+Model (as B): {{ generated_answer }}
+Correct: {{ expected_answer }}
+"""
+
+
+def evaluate_conversation_response(
+    sample: eval_types.Sample,
+) -> eval_types.InstructResult:
+    sample.history = [msg for msg in sample.history if msg["role"] != "system"]
+    return gpt_eval._evaluate_answer_gpt(CONVO_SYSTEM_PROMPT, CONVO_USER_PROMPT, sample)
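
To see how CONVO_USER_PROMPT flattens a conversation into the A/B transcript the rater is shown, here is a small rendering sketch. It assumes the prompt is rendered with Jinja2 (which the {%- for %} syntax suggests); the actual rendering happens inside gpt_eval._evaluate_answer_gpt, whose body is collapsed in this diff.

    import jinja2

    from ultravox.evaluation import gpt_eval_conv

    # Hypothetical rendering check, not part of this commit.
    rendered = jinja2.Template(gpt_eval_conv.CONVO_USER_PROMPT).render(
        history=[
            {"role": "user", "content": "T1"},
            {"role": "assistant", "content": "T2"},
        ],
        question="T3",
        generated_answer="T4",
        expected_answer="EXP",
    )
    print(rendered)
    # Per the assertions in gpt_eval_test.py, the output contains the lines:
    #   A: T1
    #   B: T2
    #   A: T3
    #   Model (as B): T4
    #   Correct: EXP
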
26 changes: 26 additions & 0 deletions ultravox/evaluation/gpt_eval_instruct.py

@@ -0,0 +1,26 @@
+from ultravox.evaluation import eval_types
+from ultravox.evaluation import gpt_eval
+
+INSTRUCT_SYSTEM_PROMPT = f"""
+You are an expert evaluator of AI systems.
+Given a question with a specified instruction, you will be rating the correctness of an AI model's ability to follow that instruction.
+Based on the supplied answer, and exemplary (correct) answer, you will rate the model's answer as either correct or incorrect.
+Award 1 point if the model followed the instruction, and 0 points if it did not.
+For example, given a question with an instruction of "Write a sentence about pickleball",
+- if the model responds "Pickleball is a tennis-like game played with a wiffle ball.", you should award 1 point.
+- if the model responds "Pickleball is a type of fruit", you should award 0 points.
+- if the model responds with something off-topic or nonsensical, you should award 0 points.
+Your response MUST start with either 0 or 1, followed by a space, and then an explanation for why you awarded that score.
+"""
+INSTRUCT_USER_PROMPT = """
+Using the supplied correct answer as an example, evaluate the model's ability to follow the instructions in the question below:
+Question: {{ question }}
+Model answer: {{ generated_answer }}
+Correct answer: {{ expected_answer }}
+"""
+
+
+def evaluate_answer_instruct(sample: eval_types.Sample) -> eval_types.InstructResult:
+    return gpt_eval._evaluate_answer_gpt(
+        INSTRUCT_SYSTEM_PROMPT, INSTRUCT_USER_PROMPT, sample
+    )
31 changes: 31 additions & 0 deletions ultravox/evaluation/gpt_eval_test.py

@@ -0,0 +1,31 @@
+import re
+from unittest import mock
+
+from ultravox.evaluation import eval_types
+from ultravox.evaluation import gpt_eval
+from ultravox.evaluation import gpt_eval_conv
+
+
+def test_evaluate_conversation():
+    gpt_eval.client = mock.MagicMock()
+    sample = eval_types.Sample(
+        history=[
+            {"role": "system", "content": "Blah blah blah"},
+            {"role": "user", "content": "T1"},
+            {"role": "assistant", "content": "T2"},
+        ],
+        question="T3",
+        generated_answer="T4",
+        expected_answer="EXP",
+    )
+    expected_turns = "A: T1\n\nB: T2\n\nA: T3\n\nModel (as B): T4\nCorrect: EXP"
+
+    gpt_eval_conv.evaluate_conversation_response(sample)
+
+    completion_args = gpt_eval.client.chat.completions.create.call_args[1]
+    assert len(completion_args["messages"]) == 2
+    assert completion_args["messages"][0]["role"] == "system"
+    assert completion_args["messages"][1]["role"] == "user"
+    gpt_question = re.sub("\n *", "\n", completion_args["messages"][1]["content"])
+    assert expected_turns in gpt_question
+    assert "Blah blah blah" not in gpt_question
