separate gpt_evals + test for conv eval
farzadab committed Jun 25, 2024
1 parent 87aaf19 commit 5ba015c
Showing 7 changed files with 116 additions and 77 deletions.
10 changes: 6 additions & 4 deletions ultravox/evaluation/eval.py

@@ -1,5 +1,7 @@
 from ultravox.evaluation import eval_types
-from ultravox.evaluation import gpt_eval
+from ultravox.evaluation import gpt_eval_boolq
+from ultravox.evaluation import gpt_eval_conv
+from ultravox.evaluation import gpt_eval_instruct
 from ultravox.evaluation import string_based
 from ultravox.evaluation import wer
 
@@ -8,11 +10,11 @@ def evaluate_answer(sample: eval_types.Sample, metric: str) -> eval_types.Result
     if metric == "asr":
         return wer.evaluate_answer_asr(sample)
     elif metric == "boolq":
-        return gpt_eval.evaluate_answer_boolq(sample)
+        return gpt_eval_boolq.evaluate_answer_boolq(sample)
     elif metric == "instruct":
-        return gpt_eval.evaluate_answer_instruct(sample)
+        return gpt_eval_instruct.evaluate_answer_instruct(sample)
     elif metric == "conversation":
-        return gpt_eval.evaluate_conversation_response(sample)
+        return gpt_eval_conv.evaluate_conversation_response(sample)
     elif metric == "exact_match_last_word":
         return string_based.match_last_word(sample)
     else:
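
For context, a minimal caller-side sketch of the dispatch above (a hypothetical usage example, not part of this commit; the Sample fields mirror the construction in gpt_eval_test.py below, and result.score / result.reason come from the InstructResult visible in gpt_eval.py):

    from ultravox.evaluation import eval, eval_types

    sample = eval_types.Sample(
        history=[
            {"role": "user", "content": "How's the weather today?"},
            {"role": "assistant", "content": "Sunny and warm."},
        ],
        question="Should I bring an umbrella?",
        generated_answer="No, you shouldn't need one.",
        expected_answer="Probably not, it looks clear.",
    )
    # The GPT-backed metrics ("boolq", "instruct", "conversation") also require
    # gpt_eval.client to be configured; see the note under gpt_eval.py below.
    result = eval.evaluate_answer(sample, metric="conversation")
    print(result.score, result.reason)
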
2 changes: 1 addition & 1 deletion ultravox/evaluation/eval_types.py

@@ -1,5 +1,5 @@
 import dataclasses
-from typing import List, Optional, Union, Dict
+from typing import Dict, List, Optional, Union
 
 import dataclasses_json
 
72 changes: 0 additions & 72 deletions ultravox/evaluation/gpt_eval.py

@@ -6,63 +6,6 @@
 
 from ultravox.evaluation import eval_types
 
-INSTRUCT_SYSTEM_PROMPT = f"""
-You are an expert evaluator of AI systems.
-Given a question with a specified instruction, you will be rating the correctness of an AI model's ability to follow that instruction.
-Based on the supplied answer, and exemplary (correct) answer, you will rate the model's answer as either correct or incorrect.
-Award 1 point if the model followed the instruction, and 0 points if it did not.
-For example, given a question with an instruction of "Write a sentence about pickleball",
-- if the model responds "Pickleball is a tennis-like game played with a wiffle ball.", you should award 1 point.
-- if the model responds "Pickleball is a type of fruit", you should award 0 points.
-- if the model responds with something off-topic or nonsensical, you should award 0 points.
-Your response MUST start with either 0 or 1, followed by a space, and then an explanation for why you awarded that score.
-"""
-INSTRUCT_USER_PROMPT = """
-Using the supplied correct answer as an example, evaluate the model's ability to follow the instructions in the question below:
-Question: {{ question }}
-Model answer: {{ generated_answer }}
-Correct answer: {{ expected_answer }}
-"""
-
-
-BOOLQ_SYSTEM_PROMPT = f"""
-You are an expert evaluator of AI systems.
-Given a question with a known true/false answer, you will be rating the correctness of an AI model's answer to that same question.
-Based on the supplied question, answer, and expected (correct) answer, you will rate the model's answer as either correct or incorrect.
-Award 1 point if the model's answer matches the correct answer, and 0 points if the model's answer does not match, or cannot be converted to a true/false verdict.
-Model answers of the form "True", "Yes", "Yeah", etc., should be considered to match a True answer.
-Model answers of the form "False", "No", "Incorrect", etc., should be considered to match a False answer.
-Only use the supplied correct answer to make your decision; DO NOT use your own knowledge to determine correctness.
-Your response MUST start with either 0 or 1, followed by a space, and then a brief explanation for why you awarded that score.
-"""
-BOOLQ_USER_PROMPT = """
-Using the supplied correct answer as ground truth, evaluate the model's answer to the question below:
-Question: {{ question }}
-Model answer: {{ generated_answer }}
-Correct answer: {{ expected_answer }}
-"""
-
-
-CONVO_SYSTEM_PROMPT = f"""
-You are an expert evaluator of conversational AI systems.
-Given a conversation between two parties, the role of the AI system was to follow the flow of the conversation and respond appropriately.
-You are given the conversation, the AI model's response, and an exemplary (correct) response.
-The AI model response might be truncated, but that should not affect your evaluation.
-Your should award 1 point if the model's response is appropriate and follows the conversation, and 0 points if it does not, such as being off-topic or nonsensical.
-Your response MUST start with either 0 or 1, followed by a space, and then an explanation for why you awarded that score.
-"""
-
-CONVO_USER_PROMPT = """
-Using the supplied example of a correct answer, evaluate the model's ability to follow the flow of the conversation in the last message:
-Conversation:
-{%- for turn in history + [ {"role": "user", "content": question} ] %}
-{% if turn["role"] == "user" %}A{% else %}B{% endif %}: {{ turn["content"] }}
-{% endfor %}
-Model (as B): {{ generated_answer }}
-Correct: {{ expected_answer }}
-"""
-
 RATING_MODEL = "gpt-4o"
 client: Optional[openai.Client] = None
 
@@ -97,18 +40,3 @@ def _evaluate_answer_gpt(
         pass
 
     return eval_types.InstructResult(score=score, reason=rating_text[2:])
-
-
-def evaluate_answer_boolq(sample: eval_types.Sample) -> eval_types.InstructResult:
-    return _evaluate_answer_gpt(BOOLQ_SYSTEM_PROMPT, BOOLQ_USER_PROMPT, sample)
-
-
-def evaluate_answer_instruct(sample: eval_types.Sample) -> eval_types.InstructResult:
-    return _evaluate_answer_gpt(INSTRUCT_SYSTEM_PROMPT, INSTRUCT_USER_PROMPT, sample)
-
-
-def evaluate_conversation_response(
-    sample: eval_types.Sample,
-) -> eval_types.InstructResult:
-    sample.history = [msg for msg in sample.history if msg["role"] != "system"]
-    return _evaluate_answer_gpt(CONVO_SYSTEM_PROMPT, CONVO_USER_PROMPT, sample)
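
The shared GPT rating helper, RATING_MODEL, and the module-level client stay behind in gpt_eval.py. Since client defaults to None, it presumably has to be set before any of the evaluators run; a minimal setup sketch assuming the standard OpenAI SDK (how the project actually initializes the client is not shown in this diff):

    import openai

    from ultravox.evaluation import gpt_eval

    # Hypothetical setup; openai.Client() reads OPENAI_API_KEY from the environment.
    gpt_eval.client = openai.Client()
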
23 changes: 23 additions & 0 deletions ultravox/evaluation/gpt_eval_boolq.py

@@ -0,0 +1,23 @@
+from ultravox.evaluation import eval_types
+from ultravox.evaluation import gpt_eval
+
+BOOLQ_SYSTEM_PROMPT = f"""
+You are an expert evaluator of AI systems.
+Given a question with a known true/false answer, you will be rating the correctness of an AI model's answer to that same question.
+Based on the supplied question, answer, and expected (correct) answer, you will rate the model's answer as either correct or incorrect.
+Award 1 point if the model's answer matches the correct answer, and 0 points if the model's answer does not match, or cannot be converted to a true/false verdict.
+Model answers of the form "True", "Yes", "Yeah", etc., should be considered to match a True answer.
+Model answers of the form "False", "No", "Incorrect", etc., should be considered to match a False answer.
+Only use the supplied correct answer to make your decision; DO NOT use your own knowledge to determine correctness.
+Your response MUST start with either 0 or 1, followed by a space, and then a brief explanation for why you awarded that score.
+"""
+BOOLQ_USER_PROMPT = """
+Using the supplied correct answer as ground truth, evaluate the model's answer to the question below:
+Question: {{ question }}
+Model answer: {{ generated_answer }}
+Correct answer: {{ expected_answer }}
+"""
+
+
+def evaluate_answer_boolq(sample: eval_types.Sample) -> eval_types.InstructResult:
+    return gpt_eval._evaluate_answer_gpt(BOOLQ_SYSTEM_PROMPT, BOOLQ_USER_PROMPT, sample)
29 changes: 29 additions & 0 deletions ultravox/evaluation/gpt_eval_conv.py

@@ -0,0 +1,29 @@
+from ultravox.evaluation import eval_types
+from ultravox.evaluation import gpt_eval
+
+CONVO_SYSTEM_PROMPT = f"""
+You are an expert evaluator of conversational AI systems.
+Given a conversation between two parties, the role of the AI system was to follow the flow of the conversation and respond appropriately.
+You are given the conversation, the AI model's response, and an exemplary (correct) response.
+The AI model response might be truncated, but that should not affect your evaluation.
+Your should award 1 point if the model's response is appropriate and follows the conversation, and 0 points if it does not, such as being off-topic or nonsensical.
+Your response MUST start with either 0 or 1, followed by a space, and then an explanation for why you awarded that score.
+"""
+
+CONVO_USER_PROMPT = """
+Using the supplied example of a correct answer, evaluate the model's ability to follow the flow of the conversation in the last message:
+Conversation:
+{%- for turn in history + [ {"role": "user", "content": question} ] %}
+{% if turn["role"] == "user" %}A{% else %}B{% endif %}: {{ turn["content"] }}
+{% endfor %}
+Model (as B): {{ generated_answer }}
+Correct: {{ expected_answer }}
+"""
+
+
+def evaluate_conversation_response(
+    sample: eval_types.Sample,
+) -> eval_types.InstructResult:
+    sample.history = [msg for msg in sample.history if msg["role"] != "system"]
+    return gpt_eval._evaluate_answer_gpt(CONVO_SYSTEM_PROMPT, CONVO_USER_PROMPT, sample)
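
To see how CONVO_USER_PROMPT flattens a conversation into the A/B transcript the rater is shown, here is a small rendering sketch. It assumes the prompt is rendered with Jinja2 (which the {%- for %} syntax suggests); the actual rendering happens inside gpt_eval._evaluate_answer_gpt, whose body is collapsed in this diff.

    import jinja2

    from ultravox.evaluation import gpt_eval_conv

    # Hypothetical rendering check, not part of this commit.
    rendered = jinja2.Template(gpt_eval_conv.CONVO_USER_PROMPT).render(
        history=[
            {"role": "user", "content": "T1"},
            {"role": "assistant", "content": "T2"},
        ],
        question="T3",
        generated_answer="T4",
        expected_answer="EXP",
    )
    print(rendered)
    # Per the assertions in gpt_eval_test.py, the output contains the lines:
    #   A: T1
    #   B: T2
    #   A: T3
    #   Model (as B): T4
    #   Correct: EXP
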
26 changes: 26 additions & 0 deletions ultravox/evaluation/gpt_eval_instruct.py

@@ -0,0 +1,26 @@
+from ultravox.evaluation import eval_types
+from ultravox.evaluation import gpt_eval
+
+INSTRUCT_SYSTEM_PROMPT = f"""
+You are an expert evaluator of AI systems.
+Given a question with a specified instruction, you will be rating the correctness of an AI model's ability to follow that instruction.
+Based on the supplied answer, and exemplary (correct) answer, you will rate the model's answer as either correct or incorrect.
+Award 1 point if the model followed the instruction, and 0 points if it did not.
+For example, given a question with an instruction of "Write a sentence about pickleball",
+- if the model responds "Pickleball is a tennis-like game played with a wiffle ball.", you should award 1 point.
+- if the model responds "Pickleball is a type of fruit", you should award 0 points.
+- if the model responds with something off-topic or nonsensical, you should award 0 points.
+Your response MUST start with either 0 or 1, followed by a space, and then an explanation for why you awarded that score.
+"""
+INSTRUCT_USER_PROMPT = """
+Using the supplied correct answer as an example, evaluate the model's ability to follow the instructions in the question below:
+Question: {{ question }}
+Model answer: {{ generated_answer }}
+Correct answer: {{ expected_answer }}
+"""
+
+
+def evaluate_answer_instruct(sample: eval_types.Sample) -> eval_types.InstructResult:
+    return gpt_eval._evaluate_answer_gpt(
+        INSTRUCT_SYSTEM_PROMPT, INSTRUCT_USER_PROMPT, sample
+    )
31 changes: 31 additions & 0 deletions ultravox/evaluation/gpt_eval_test.py

@@ -0,0 +1,31 @@
+import re
+from unittest import mock
+
+from ultravox.evaluation import eval_types
+from ultravox.evaluation import gpt_eval
+from ultravox.evaluation import gpt_eval_conv
+
+
+def test_evaluate_conversation():
+    gpt_eval.client = mock.MagicMock()
+    sample = eval_types.Sample(
+        history=[
+            {"role": "system", "content": "Blah blah blah"},
+            {"role": "user", "content": "T1"},
+            {"role": "assistant", "content": "T2"},
+        ],
+        question="T3",
+        generated_answer="T4",
+        expected_answer="EXP",
+    )
+    expected_turns = "A: T1\n\nB: T2\n\nA: T3\n\nModel (as B): T4\nCorrect: EXP"
+
+    gpt_eval_conv.evaluate_conversation_response(sample)
+
+    completion_args = gpt_eval.client.chat.completions.create.call_args[1]
+    assert len(completion_args["messages"]) == 2
+    assert completion_args["messages"][0]["role"] == "system"
+    assert completion_args["messages"][1]["role"] == "user"
+    gpt_question = re.sub("\n *", "\n", completion_args["messages"][1]["content"])
+    assert expected_turns in gpt_question
+    assert "Blah blah blah" not in gpt_question
