SODA Dataset for Training #35

Merged Jul 8, 2024 · 17 commits
Changes from 14 commits
62 changes: 55 additions & 7 deletions ultravox/data/datasets.py
@@ -305,15 +305,17 @@ def _get_transcribe_messages(self, idx: int, text: str) -> List[Dict[str, str]]:
{"role": "assistant", "content": text},
]

def _get_audio(self, row: transformers.BatchFeature) -> np.ndarray:
def _get_audio(
self, row: transformers.BatchFeature, column_name: str = "audio"
) -> np.ndarray:
# Hugging Face datasets have an Audio object, with array and sampling_rate fields.
# For MDS, this object is flattened into audio_array and audio_sampling_rate fields.
if "audio" in row:
audio = row["audio"]["array"]
sampling_rate = row["audio"]["sampling_rate"]
elif "audio_array" in row:
audio = row["audio_array"]
sampling_rate = row["audio_sampling_rate"]
if column_name in row:
audio = row[column_name]["array"]
sampling_rate = row[column_name]["sampling_rate"]
elif f"{column_name}_array" in row:
audio = row[f"{column_name}_array"]
sampling_rate = row[f"{column_name}_sampling_rate"]
else:
raise ValueError("No audio field found in row.")
assert sampling_rate == SAMPLE_RATE
@@ -681,6 +683,51 @@ def _get_sample(self, idx, row) -> VoiceSample:
return self._get_transcribe_sample(idx, row, tcol="text")


class SodaDataset(VoiceDataset):
SYS_PROMPTS = [
"Follow the flow of the conversation and respond just like a human would in the same situation.",
"Engage in the conversation naturally, responding as a human would.",
"Follow the dialogue and reply like a person in that situation.",
"Participate in the chat and answer as if you were a human.",
"Interact smoothly and respond just like a person would.",
"Stay in the moment and reply as a human would in the conversation.",
"Flow with the discussion and respond naturally, as a person would.",
"Keep the dialogue going and answer like a human would.",
"Follow along and reply in a way a person would in the chat.",
"Stay engaged in the conversation and respond like a human.",
"Maintain the flow of the chat and answer just as a person would.",
]

def __init__(self, args: VoiceDatasetArgs) -> None:
super().__init__(args)
dataset = self._load_audio_dataset(
"fixie-ai/soda-audio", split=args.split.value
)
self._init_dataset(dataset)

def _get_sample(self, idx, row) -> VoiceSample:
turns = row["dialogue"]
# Make sure the last turn is the assistant's
roles = ["user", "assistant"] if len(turns) % 2 == 0 else ["assistant", "user"]

num_prompts = min(self._args.num_prompts, len(self.SYS_PROMPTS))
sys_prompt = self.SYS_PROMPTS[idx % num_prompts]
Contributor: did we end up using a RNG for this sort of thing rather than the index?

Contributor: (I forget where but we discussed adding a private RNG to datasets to allow them to simply pull a value from the RNG rather than using the index counter and various moduli)

Contributor Author: Yes, the idea was that we do that in the next PR.

Contributor Author: Might as well just do it now I guess since I have the code.

Contributor Author: Done.

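A minimal sketch of the per-dataset RNG idea discussed in this thread; the class name, seed argument, and helper method are illustrative assumptions, not the PR's actual implementation:

```python
import numpy as np

SYS_PROMPTS = ["prompt A", "prompt B", "prompt C"]  # stand-in prompt list

class PromptSampler:
    """Toy stand-in for a dataset that holds its own private RNG."""

    def __init__(self, seed: int = 42):
        # Each dataset instance keeps its own RNG, so prompt selection is
        # reproducible and independent of the sample index.
        self._rng = np.random.default_rng(seed)

    def choose_prompt(self, prompts: list) -> str:
        # Draw directly from the RNG instead of computing idx % len(prompts).
        return prompts[self._rng.integers(len(prompts))]

sampler = PromptSampler(seed=0)
sys_prompt = sampler.choose_prompt(SYS_PROMPTS)
```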

messages = [{"role": "system", "content": sys_prompt}]
messages += [
{"role": roles[i % 2], "content": turn} for i, turn in enumerate(turns)
]
messages[-1]["content"] = row["alt_last_turn"]
if self._args.include_audio:
messages[-2]["content"] = "<|audio|>"

return self._make_sample(
messages,
audio=self._get_audio(row, "audio_second_last_turn"),
audio_transcript=turns[-2],
)


def create_dataset(name: str, args: VoiceDatasetArgs) -> data.IterableDataset:
DATASET_MAP: Dict[str, Any] = {
"anyinstruct": AnyInstructAnswerDataset,
@@ -694,6 +741,7 @@ def create_dataset(name: str, args: VoiceDatasetArgs) -> data.IterableDataset:
"voxpopuli": VoxPopuliDataset,
"commonvoice": CommonVoiceDataset,
"peoplespeech": PeopleSpeechDataset,
"soda": SodaDataset,
"dummy": LibriSpeechDummyDataset,
}
return DATASET_MAP[name](args)
10 changes: 7 additions & 3 deletions ultravox/evaluation/eval.py
@@ -1,5 +1,7 @@
from ultravox.evaluation import eval_types
from ultravox.evaluation import gpt_eval
from ultravox.evaluation import gpt_eval_boolq
from ultravox.evaluation import gpt_eval_conv
from ultravox.evaluation import gpt_eval_instruct
from ultravox.evaluation import string_based
from ultravox.evaluation import wer

@@ -8,9 +10,11 @@ def evaluate_answer(sample: eval_types.Sample, metric: str) -> eval_types.Result
if metric == "asr":
return wer.evaluate_answer_asr(sample)
elif metric == "boolq":
return gpt_eval.evaluate_answer_boolq(sample)
return gpt_eval_boolq.evaluate_answer_boolq(sample)
elif metric == "instruct":
return gpt_eval.evaluate_answer_instruct(sample)
return gpt_eval_instruct.evaluate_answer_instruct(sample)
elif metric == "conversation":
return gpt_eval_conv.evaluate_conversation_response(sample)
elif metric == "exact_match_last_word":
return string_based.match_last_word(sample)
else:
3 changes: 2 additions & 1 deletion ultravox/evaluation/eval_types.py
@@ -1,5 +1,5 @@
import dataclasses
from typing import Optional, Union
from typing import Dict, List, Optional, Union

import dataclasses_json

@@ -9,6 +9,7 @@ class Sample(dataclasses_json.DataClassJsonMixin):
question: str
generated_answer: str
expected_answer: str
history: List[Dict[str, str]] = dataclasses.field(default_factory=list)


@dataclasses.dataclass
48 changes: 3 additions & 45 deletions ultravox/evaluation/gpt_eval.py
@@ -1,46 +1,11 @@
import dataclasses
from typing import Optional

import jinja2
import openai

from ultravox.evaluation import eval_types

INSTRUCT_SYSTEM_PROMPT = f"""
You are an expert evaluator of AI systems.
Given a question with a specified instruction, you will be rating the correctness of an AI model's ability to follow that instruction.
Based on the supplied answer, and exemplary (correct) answer, you will rate the model's answer as either correct or incorrect.
Award 1 point if the model followed the instruction, and 0 points if it did not.
For example, given a question with an instruction of "Write a sentence about pickleball",
- if the model responds "Pickleball is a tennis-like game played with a wiffle ball.", you should award 1 point.
- if the model responds "Pickleball is a type of fruit", you should award 0 points.
- if the model responds with something off-topic or nonsensical, you should award 0 points.
Your response MUST start with either 0 or 1, followed by a space, and then an explanation for why you awarded that score.
"""
INSTRUCT_USER_PROMPT = """
Using the supplied correct answer as an example, evaluate the model's ability to follow the instructions in the question below:
Question: {question}
Model answer: {generated_answer}
Correct answer: {expected_answer}
"""


BOOLQ_SYSTEM_PROMPT = f"""
You are an expert evaluator of AI systems.
Given a question with a known true/false answer, you will be rating the correctness of an AI model's answer to that same question.
Based on the supplied question, answer, and expected (correct) answer, you will rate the model's answer as either correct or incorrect.
Award 1 point if the model's answer matches the correct answer, and 0 points if the model's answer does not match, or cannot be converted to a true/false verdict.
Model answers of the form "True", "Yes", "Yeah", etc., should be considered to match a True answer.
Model answers of the form "False", "No", "Incorrect", etc., should be considered to match a False answer.
Only use the supplied correct answer to make your decision; DO NOT use your own knowledge to determine correctness.
Your response MUST start with either 0 or 1, followed by a space, and then a brief explanation for why you awarded that score.
"""
BOOLQ_USER_PROMPT = """
Using the supplied correct answer as ground truth, evaluate the model's answer to the question below:
Question: {question}
Model answer: {generated_answer}
Correct answer: {expected_answer}
"""

RATING_MODEL = "gpt-4o"
client: Optional[openai.Client] = None

@@ -51,13 +16,14 @@ def _evaluate_answer_gpt(
global client
if client is None:
client = openai.Client()
template = jinja2.Template(user_prompt)
response = client.chat.completions.create(
model=RATING_MODEL,
messages=[
{"role": "system", "content": sys_prompt},
{
"role": "user",
"content": user_prompt.format(**dataclasses.asdict(sample)),
"content": template.render(**dataclasses.asdict(sample)),
},
],
max_tokens=50,
Expand All @@ -74,11 +40,3 @@ def _evaluate_answer_gpt(
pass

return eval_types.InstructResult(score=score, reason=rating_text[2:])


def evaluate_answer_boolq(sample: eval_types.Sample) -> eval_types.InstructResult:
return _evaluate_answer_gpt(BOOLQ_SYSTEM_PROMPT, BOOLQ_USER_PROMPT, sample)


def evaluate_answer_instruct(sample: eval_types.Sample) -> eval_types.InstructResult:
return _evaluate_answer_gpt(INSTRUCT_SYSTEM_PROMPT, INSTRUCT_USER_PROMPT, sample)
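Note on the jinja2 switch above: the new conversation metric (gpt_eval_conv.py below) needs to loop over the message history inside the user prompt, which str.format cannot express. A minimal illustration of the rendering step, with made-up sample values:

```python
import jinja2

template = jinja2.Template(
    "{% for turn in history %}{{ turn['role'] }}: {{ turn['content'] }}\n{% endfor %}"
    "Q: {{ question }}"
)
rendered = template.render(
    history=[{"role": "user", "content": "hi"}, {"role": "assistant", "content": "hello"}],
    question="How are you?",
)
print(rendered)
# user: hi
# assistant: hello
# Q: How are you?
```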
23 changes: 23 additions & 0 deletions ultravox/evaluation/gpt_eval_boolq.py
@@ -0,0 +1,23 @@
from ultravox.evaluation import eval_types
from ultravox.evaluation import gpt_eval

BOOLQ_SYSTEM_PROMPT = f"""
You are an expert evaluator of AI systems.
Given a question with a known true/false answer, you will be rating the correctness of an AI model's answer to that same question.
Based on the supplied question, answer, and expected (correct) answer, you will rate the model's answer as either correct or incorrect.
Award 1 point if the model's answer matches the correct answer, and 0 points if the model's answer does not match, or cannot be converted to a true/false verdict.
Model answers of the form "True", "Yes", "Yeah", etc., should be considered to match a True answer.
Model answers of the form "False", "No", "Incorrect", etc., should be considered to match a False answer.
Only use the supplied correct answer to make your decision; DO NOT use your own knowledge to determine correctness.
Your response MUST start with either 0 or 1, followed by a space, and then a brief explanation for why you awarded that score.
"""
BOOLQ_USER_PROMPT = """
Using the supplied correct answer as ground truth, evaluate the model's answer to the question below:
Question: {{ question }}
Model answer: {{ generated_answer }}
Correct answer: {{ expected_answer }}
"""


def evaluate_answer_boolq(sample: eval_types.Sample) -> eval_types.InstructResult:
return gpt_eval._evaluate_answer_gpt(BOOLQ_SYSTEM_PROMPT, BOOLQ_USER_PROMPT, sample)
29 changes: 29 additions & 0 deletions ultravox/evaluation/gpt_eval_conv.py
@@ -0,0 +1,29 @@
from ultravox.evaluation import eval_types
from ultravox.evaluation import gpt_eval

CONVO_SYSTEM_PROMPT = f"""
You are an expert evaluator of conversational AI systems.
Given a conversation between two parties, the role of the AI system was to follow the flow of the conversation and respond appropriately.
You are given the conversation, the AI model's response, and an exemplary (correct) response.
The AI model response might be truncated, but that should not affect your evaluation.
You should award 1 point if the model's response is appropriate and follows the conversation, and 0 points if it does not, such as being off-topic or nonsensical.
Your response MUST start with either 0 or 1, followed by a space, and then an explanation for why you awarded that score.
"""

CONVO_USER_PROMPT = """
Using the supplied example of a correct answer, evaluate the model's ability to follow the flow of the conversation in the last message:

Conversation:
{%- for turn in history + [ {"role": "user", "content": question} ] %}
{% if turn["role"] == "user" %}A{% else %}B{% endif %}: {{ turn["content"] }}
{% endfor %}
Model (as B): {{ generated_answer }}
Correct: {{ expected_answer }}
"""


def evaluate_conversation_response(
sample: eval_types.Sample,
) -> eval_types.InstructResult:
sample.history = [msg for msg in sample.history if msg["role"] != "system"]
return gpt_eval._evaluate_answer_gpt(CONVO_SYSTEM_PROMPT, CONVO_USER_PROMPT, sample)
26 changes: 26 additions & 0 deletions ultravox/evaluation/gpt_eval_instruct.py
@@ -0,0 +1,26 @@
from ultravox.evaluation import eval_types
from ultravox.evaluation import gpt_eval

INSTRUCT_SYSTEM_PROMPT = f"""
You are an expert evaluator of AI systems.
Given a question with a specified instruction, you will be rating the correctness of an AI model's ability to follow that instruction.
Based on the supplied answer, and exemplary (correct) answer, you will rate the model's answer as either correct or incorrect.
Award 1 point if the model followed the instruction, and 0 points if it did not.
For example, given a question with an instruction of "Write a sentence about pickleball",
- if the model responds "Pickleball is a tennis-like game played with a wiffle ball.", you should award 1 point.
- if the model responds "Pickleball is a type of fruit", you should award 0 points.
- if the model responds with something off-topic or nonsensical, you should award 0 points.
Your response MUST start with either 0 or 1, followed by a space, and then an explanation for why you awarded that score.
"""
INSTRUCT_USER_PROMPT = """
Using the supplied correct answer as an example, evaluate the model's ability to follow the instructions in the question below:
Question: {{ question }}
Model answer: {{ generated_answer }}
Correct answer: {{ expected_answer }}
"""


def evaluate_answer_instruct(sample: eval_types.Sample) -> eval_types.InstructResult:
return gpt_eval._evaluate_answer_gpt(
INSTRUCT_SYSTEM_PROMPT, INSTRUCT_USER_PROMPT, sample
)
31 changes: 31 additions & 0 deletions ultravox/evaluation/gpt_eval_test.py
@@ -0,0 +1,31 @@
import re
from unittest import mock

from ultravox.evaluation import eval_types
from ultravox.evaluation import gpt_eval
from ultravox.evaluation import gpt_eval_conv


def test_evaluate_conversation():
gpt_eval.client = mock.MagicMock()
sample = eval_types.Sample(
history=[
{"role": "system", "content": "Blah blah blah"},
{"role": "user", "content": "T1"},
{"role": "assistant", "content": "T2"},
],
question="T3",
generated_answer="T4",
expected_answer="EXP",
)
expected_turns = "A: T1\n\nB: T2\n\nA: T3\n\nModel (as B): T4\nCorrect: EXP"

gpt_eval_conv.evaluate_conversation_response(sample)

completion_args = gpt_eval.client.chat.completions.create.call_args[1]
assert len(completion_args["messages"]) == 2
assert completion_args["messages"][0]["role"] == "system"
assert completion_args["messages"][1]["role"] == "user"
gpt_question = re.sub("\n *", "\n", completion_args["messages"][1]["content"])
assert expected_turns in gpt_question
assert "Blah blah blah" not in gpt_question
10 changes: 5 additions & 5 deletions ultravox/tools/data_tool.py
@@ -38,11 +38,11 @@ def main(args: argparse.Namespace):
for i, sample in enumerate(out_set):
print(f"--- Sample {i} ---")
messages = sample.messages
assert len(messages) == 2, f"Bad sample (messages) {len(messages)}"
assert messages[0]["role"] == "user", f"Bad sample (Q role): {messages}"
assert messages[1]["role"] == "assistant", f"Bad sample (A role): {messages}"
answer = messages[1]["content"].replace("\n", "\\n")
print(f"Q: {messages[0]['content']} [\"{sample.audio_transcript}\"]")
assert len(messages) >= 2, f"Bad sample (messages) {len(messages)}"
assert messages[-1]["role"] == "user", f"Bad sample (Q role): {messages}"
assert messages[-2]["role"] == "assistant", f"Bad sample (A role): {messages}"
answer = messages[-2]["content"].replace("\n", "\\n")
print(f"Q: {messages[-1]['content']} [\"{sample.audio_transcript}\"]")
print(f"A: {answer}")
if args.play:
audio = sample.audio
22 changes: 16 additions & 6 deletions ultravox/tools/infer_api.py
@@ -48,7 +48,7 @@ def infer_stream(
headers["Authorization"] = f"Bearer {self._api_key}"
data = {
"model": self._model,
"messages": [self._build_message(sample)],
"messages": self._build_messages(sample),
"stream": True,
}
if max_tokens is not None:
@@ -68,19 +68,27 @@ def infer_stream(
obj["usage"]["prompt_tokens"], obj["usage"]["completion_tokens"]
)

def _build_message(self, sample: datasets.VoiceSample):
def _build_messages(self, sample: datasets.VoiceSample):
"""
Convert a VoiceSample into a list of messages for the OpenAI API.
This function assumes that if the sample has an audio field, it is in
the last message, indicated by a "<|audio|>" placeholder.

Audio is converted to a data URI and inserted into the message under an image_url type.
"""
if sample.audio is None:
return {"role": "user", "content": sample.messages[0]["content"]}
return sample

fragments = sample.messages[0]["content"].split("<|audio|>")
fragments = sample.messages[-1]["content"].split("<|audio|>")
assert len(fragments) == 2, "Expected one <|audio|> placeholder"
url = datasets.audio_to_data_uri(sample.audio, sample.sample_rate)
parts = [
{"type": "text", "text": fragments[0]},
{"type": "image_url", "image_url": {"url": url}},
{"type": "text", "text": fragments[1]},
]
return {"role": "user", "content": parts}
last_turn = {"role": "user", "content": parts}
return sample.messages[:-1] + [last_turn]


class DatabricksInference(base.VoiceInference):
@@ -124,7 +132,9 @@ def infer(
# audio as a file, not as a base64-encoded string. There's probably
# a better way to do this, but I spent too much time on this already.
# api = self._client.view_api(print_info=False, return_format="dict")
text = sample.messages[0]["content"]
text = sample.messages[0][
"content"
] # TODO: change regarding multiple messages?
if self._url.startswith("https://demo.tincans.ai"):
args: List[Any] = [text]
if sample.audio is not None: