SODA Dataset for Training #35

Merged Jul 8, 2024 · 17 commits
Changes from 14 commits
62 changes: 55 additions & 7 deletions ultravox/data/datasets.py
@@ -305,15 +305,17 @@ def _get_transcribe_messages(self, idx: int, text: str) -> List[Dict[str, str]]:
{"role": "assistant", "content": text},
]

def _get_audio(self, row: transformers.BatchFeature) -> np.ndarray:
def _get_audio(
self, row: transformers.BatchFeature, column_name: str = "audio"
) -> np.ndarray:
# Hugging Face datasets have an Audio object, with array and sampling_rate fields.
# For MDS, this object is flattened into audio_array and audio_sampling_rate fields.
if "audio" in row:
audio = row["audio"]["array"]
sampling_rate = row["audio"]["sampling_rate"]
elif "audio_array" in row:
audio = row["audio_array"]
sampling_rate = row["audio_sampling_rate"]
if column_name in row:
audio = row[column_name]["array"]
sampling_rate = row[column_name]["sampling_rate"]
elif f"{column_name}_array" in row:
audio = row[f"{column_name}_array"]
sampling_rate = row[f"{column_name}_sampling_rate"]
else:
raise ValueError("No audio field found in row.")
assert sampling_rate == SAMPLE_RATE
@@ -681,6 +683,51 @@ def _get_sample(self, idx, row) -> VoiceSample:
return self._get_transcribe_sample(idx, row, tcol="text")


class SodaDataset(VoiceDataset):
SYS_PROMPTS = [
"Follow the flow of the conversation and respond just like a human would in the same situation.",
"Engage in the conversation naturally, responding as a human would.",
"Follow the dialogue and reply like a person in that situation.",
"Participate in the chat and answer as if you were a human.",
"Interact smoothly and respond just like a person would.",
"Stay in the moment and reply as a human would in the conversation.",
"Flow with the discussion and respond naturally, as a person would.",
"Keep the dialogue going and answer like a human would.",
"Follow along and reply in a way a person would in the chat.",
"Stay engaged in the conversation and respond like a human.",
"Maintain the flow of the chat and answer just as a person would.",
]

def __init__(self, args: VoiceDatasetArgs) -> None:
super().__init__(args)
dataset = self._load_audio_dataset(
"fixie-ai/soda-audio", split=args.split.value
)
self._init_dataset(dataset)

def _get_sample(self, idx, row) -> VoiceSample:
turns = row["dialogue"]
# Make sure the last turn is the assistant's
roles = ["user", "assistant"] if len(turns) % 2 == 0 else ["assistant", "user"]

num_prompts = min(self._args.num_prompts, len(self.SYS_PROMPTS))
sys_prompt = self.SYS_PROMPTS[idx % num_prompts]
Contributor: did we end up using a RNG for this sort of thing rather than the index?

Contributor: (I forget where but we discussed adding a private RNG to datasets to allow them to simply pull a value from the RNG rather than using the index counter and various moduli)

Contributor Author: Yes, the idea was that we do that in the next PR.

Contributor Author: Might as well just do it now I guess since I have the code.

Contributor Author: Done.

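A minimal sketch of the per-dataset RNG idea discussed in this thread; the class name, seed argument, and helper method are illustrative assumptions, not the PR's actual implementation:

```python
import numpy as np

SYS_PROMPTS = ["prompt A", "prompt B", "prompt C"]  # stand-in prompt list

class PromptSampler:
    """Toy stand-in for a dataset that holds its own private RNG."""

    def __init__(self, seed: int = 42):
        # Each dataset instance keeps its own RNG, so prompt selection is
        # reproducible and independent of the sample index.
        self._rng = np.random.default_rng(seed)

    def choose_prompt(self, prompts: list) -> str:
        # Draw directly from the RNG instead of computing idx % len(prompts).
        return prompts[self._rng.integers(len(prompts))]

sampler = PromptSampler(seed=0)
sys_prompt = sampler.choose_prompt(SYS_PROMPTS)
```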

messages = [{"role": "system", "content": sys_prompt}]
messages += [
{"role": roles[i % 2], "content": turn} for i, turn in enumerate(turns)
]
messages[-1]["content"] = row["alt_last_turn"]
if self._args.include_audio:
messages[-2]["content"] = "<|audio|>"

return self._make_sample(
messages,
audio=self._get_audio(row, "audio_second_last_turn"),
audio_transcript=turns[-2],
)


def create_dataset(name: str, args: VoiceDatasetArgs) -> data.IterableDataset:
DATASET_MAP: Dict[str, Any] = {
"anyinstruct": AnyInstructAnswerDataset,
@@ -694,6 +741,7 @@ def create_dataset(name: str, args: VoiceDatasetArgs) -> data.IterableDataset:
"voxpopuli": VoxPopuliDataset,
"commonvoice": CommonVoiceDataset,
"peoplespeech": PeopleSpeechDataset,
"soda": SodaDataset,
"dummy": LibriSpeechDummyDataset,
}
return DATASET_MAP[name](args)
10 changes: 7 additions & 3 deletions ultravox/evaluation/eval.py
@@ -1,5 +1,7 @@
from ultravox.evaluation import eval_types
from ultravox.evaluation import gpt_eval
from ultravox.evaluation import gpt_eval_boolq
from ultravox.evaluation import gpt_eval_conv
from ultravox.evaluation import gpt_eval_instruct
from ultravox.evaluation import string_based
from ultravox.evaluation import wer

@@ -8,9 +10,11 @@ def evaluate_answer(sample: eval_types.Sample, metric: str) -> eval_types.Result
if metric == "asr":
return wer.evaluate_answer_asr(sample)
elif metric == "boolq":
return gpt_eval.evaluate_answer_boolq(sample)
return gpt_eval_boolq.evaluate_answer_boolq(sample)
elif metric == "instruct":
return gpt_eval.evaluate_answer_instruct(sample)
return gpt_eval_instruct.evaluate_answer_instruct(sample)
elif metric == "conversation":
return gpt_eval_conv.evaluate_conversation_response(sample)
elif metric == "exact_match_last_word":
return string_based.match_last_word(sample)
else:
3 changes: 2 additions & 1 deletion ultravox/evaluation/eval_types.py
@@ -1,5 +1,5 @@
import dataclasses
from typing import Optional, Union
from typing import Dict, List, Optional, Union

import dataclasses_json

@@ -9,6 +9,7 @@ class Sample(dataclasses_json.DataClassJsonMixin):
question: str
generated_answer: str
expected_answer: str
history: List[Dict[str, str]] = dataclasses.field(default_factory=list)


@dataclasses.dataclass
48 changes: 3 additions & 45 deletions ultravox/evaluation/gpt_eval.py
@@ -1,46 +1,11 @@
import dataclasses
from typing import Optional

import jinja2
import openai

from ultravox.evaluation import eval_types

INSTRUCT_SYSTEM_PROMPT = f"""
You are an expert evaluator of AI systems.
Given a question with a specified instruction, you will be rating the correctness of an AI model's ability to follow that instruction.
Based on the supplied answer, and exemplary (correct) answer, you will rate the model's answer as either correct or incorrect.
Award 1 point if the model followed the instruction, and 0 points if it did not.
For example, given a question with an instruction of "Write a sentence about pickleball",
- if the model responds "Pickleball is a tennis-like game played with a wiffle ball.", you should award 1 point.
- if the model responds "Pickleball is a type of fruit", you should award 0 points.
- if the model responds with something off-topic or nonsensical, you should award 0 points.
Your response MUST start with either 0 or 1, followed by a space, and then an explanation for why you awarded that score.
"""
INSTRUCT_USER_PROMPT = """
Using the supplied correct answer as an example, evaluate the model's ability to follow the instructions in the question below:
Question: {question}
Model answer: {generated_answer}
Correct answer: {expected_answer}
"""


BOOLQ_SYSTEM_PROMPT = f"""
You are an expert evaluator of AI systems.
Given a question with a known true/false answer, you will be rating the correctness of an AI model's answer to that same question.
Based on the supplied question, answer, and expected (correct) answer, you will rate the model's answer as either correct or incorrect.
Award 1 point if the model's answer matches the correct answer, and 0 points if the model's answer does not match, or cannot be converted to a true/false verdict.
Model answers of the form "True", "Yes", "Yeah", etc., should be considered to match a True answer.
Model answers of the form "False", "No", "Incorrect", etc., should be considered to match a False answer.
Only use the supplied correct answer to make your decision; DO NOT use your own knowledge to determine correctness.
Your response MUST start with either 0 or 1, followed by a space, and then a brief explanation for why you awarded that score.
"""
BOOLQ_USER_PROMPT = """
Using the supplied correct answer as ground truth, evaluate the model's answer to the question below:
Question: {question}
Model answer: {generated_answer}
Correct answer: {expected_answer}
"""

RATING_MODEL = "gpt-4o"
client: Optional[openai.Client] = None

@@ -51,13 +16,14 @@ def _evaluate_answer_gpt(
global client
if client is None:
client = openai.Client()
template = jinja2.Template(user_prompt)
response = client.chat.completions.create(
model=RATING_MODEL,
messages=[
{"role": "system", "content": sys_prompt},
{
"role": "user",
"content": user_prompt.format(**dataclasses.asdict(sample)),
"content": template.render(**dataclasses.asdict(sample)),
},
],
max_tokens=50,
Expand All @@ -74,11 +40,3 @@ def _evaluate_answer_gpt(
pass

return eval_types.InstructResult(score=score, reason=rating_text[2:])


def evaluate_answer_boolq(sample: eval_types.Sample) -> eval_types.InstructResult:
return _evaluate_answer_gpt(BOOLQ_SYSTEM_PROMPT, BOOLQ_USER_PROMPT, sample)


def evaluate_answer_instruct(sample: eval_types.Sample) -> eval_types.InstructResult:
return _evaluate_answer_gpt(INSTRUCT_SYSTEM_PROMPT, INSTRUCT_USER_PROMPT, sample)
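Note on the jinja2 switch above: the new conversation metric (gpt_eval_conv.py below) needs to loop over the message history inside the user prompt, which str.format cannot express. A minimal illustration of the rendering step, with made-up sample values:

```python
import jinja2

template = jinja2.Template(
    "{% for turn in history %}{{ turn['role'] }}: {{ turn['content'] }}\n{% endfor %}"
    "Q: {{ question }}"
)
rendered = template.render(
    history=[{"role": "user", "content": "hi"}, {"role": "assistant", "content": "hello"}],
    question="How are you?",
)
print(rendered)
# user: hi
# assistant: hello
# Q: How are you?
```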
23 changes: 23 additions & 0 deletions ultravox/evaluation/gpt_eval_boolq.py
@@ -0,0 +1,23 @@
from ultravox.evaluation import eval_types
from ultravox.evaluation import gpt_eval

BOOLQ_SYSTEM_PROMPT = f"""
You are an expert evaluator of AI systems.
Given a question with a known true/false answer, you will be rating the correctness of an AI model's answer to that same question.
Based on the supplied question, answer, and expected (correct) answer, you will rate the model's answer as either correct or incorrect.
Award 1 point if the model's answer matches the correct answer, and 0 points if the model's answer does not match, or cannot be converted to a true/false verdict.
Model answers of the form "True", "Yes", "Yeah", etc., should be considered to match a True answer.
Model answers of the form "False", "No", "Incorrect", etc., should be considered to match a False answer.
Only use the supplied correct answer to make your decision; DO NOT use your own knowledge to determine correctness.
Your response MUST start with either 0 or 1, followed by a space, and then a brief explanation for why you awarded that score.
"""
BOOLQ_USER_PROMPT = """
Using the supplied correct answer as ground truth, evaluate the model's answer to the question below:
Question: {{ question }}
Model answer: {{ generated_answer }}
Correct answer: {{ expected_answer }}
"""


def evaluate_answer_boolq(sample: eval_types.Sample) -> eval_types.InstructResult:
return gpt_eval._evaluate_answer_gpt(BOOLQ_SYSTEM_PROMPT, BOOLQ_USER_PROMPT, sample)
29 changes: 29 additions & 0 deletions ultravox/evaluation/gpt_eval_conv.py
@@ -0,0 +1,29 @@
from ultravox.evaluation import eval_types
from ultravox.evaluation import gpt_eval

CONVO_SYSTEM_PROMPT = f"""
You are an expert evaluator of conversational AI systems.
Given a conversation between two parties, the role of the AI system was to follow the flow of the conversation and respond appropriately.
You are given the conversation, the AI model's response, and an exemplary (correct) response.
The AI model response might be truncated, but that should not affect your evaluation.
You should award 1 point if the model's response is appropriate and follows the conversation, and 0 points if it does not, such as being off-topic or nonsensical.
Your response MUST start with either 0 or 1, followed by a space, and then an explanation for why you awarded that score.
"""

CONVO_USER_PROMPT = """
Using the supplied example of a correct answer, evaluate the model's ability to follow the flow of the conversation in the last message:

Conversation:
{%- for turn in history + [ {"role": "user", "content": question} ] %}
{% if turn["role"] == "user" %}A{% else %}B{% endif %}: {{ turn["content"] }}
{% endfor %}
Model (as B): {{ generated_answer }}
Correct: {{ expected_answer }}
"""


def evaluate_conversation_response(
sample: eval_types.Sample,
) -> eval_types.InstructResult:
sample.history = [msg for msg in sample.history if msg["role"] != "system"]
return gpt_eval._evaluate_answer_gpt(CONVO_SYSTEM_PROMPT, CONVO_USER_PROMPT, sample)
26 changes: 26 additions & 0 deletions ultravox/evaluation/gpt_eval_instruct.py
@@ -0,0 +1,26 @@
from ultravox.evaluation import eval_types
from ultravox.evaluation import gpt_eval

INSTRUCT_SYSTEM_PROMPT = f"""
You are an expert evaluator of AI systems.
Given a question with a specified instruction, you will be rating the correctness of an AI model's ability to follow that instruction.
Based on the supplied answer, and exemplary (correct) answer, you will rate the model's answer as either correct or incorrect.
Award 1 point if the model followed the instruction, and 0 points if it did not.
For example, given a question with an instruction of "Write a sentence about pickleball",
- if the model responds "Pickleball is a tennis-like game played with a wiffle ball.", you should award 1 point.
- if the model responds "Pickleball is a type of fruit", you should award 0 points.
- if the model responds with something off-topic or nonsensical, you should award 0 points.
Your response MUST start with either 0 or 1, followed by a space, and then an explanation for why you awarded that score.
"""
INSTRUCT_USER_PROMPT = """
Using the supplied correct answer as an example, evaluate the model's ability to follow the instructions in the question below:
Question: {{ question }}
Model answer: {{ generated_answer }}
Correct answer: {{ expected_answer }}
"""


def evaluate_answer_instruct(sample: eval_types.Sample) -> eval_types.InstructResult:
return gpt_eval._evaluate_answer_gpt(
INSTRUCT_SYSTEM_PROMPT, INSTRUCT_USER_PROMPT, sample
)
31 changes: 31 additions & 0 deletions ultravox/evaluation/gpt_eval_test.py
@@ -0,0 +1,31 @@
import re
from unittest import mock

from ultravox.evaluation import eval_types
from ultravox.evaluation import gpt_eval
from ultravox.evaluation import gpt_eval_conv


def test_evaluate_conversation():
gpt_eval.client = mock.MagicMock()
sample = eval_types.Sample(
history=[
{"role": "system", "content": "Blah blah blah"},
{"role": "user", "content": "T1"},
{"role": "assistant", "content": "T2"},
],
question="T3",
generated_answer="T4",
expected_answer="EXP",
)
expected_turns = "A: T1\n\nB: T2\n\nA: T3\n\nModel (as B): T4\nCorrect: EXP"

gpt_eval_conv.evaluate_conversation_response(sample)

completion_args = gpt_eval.client.chat.completions.create.call_args[1]
assert len(completion_args["messages"]) == 2
assert completion_args["messages"][0]["role"] == "system"
assert completion_args["messages"][1]["role"] == "user"
gpt_question = re.sub("\n *", "\n", completion_args["messages"][1]["content"])
assert expected_turns in gpt_question
assert "Blah blah blah" not in gpt_question
10 changes: 5 additions & 5 deletions ultravox/tools/data_tool.py
@@ -38,11 +38,11 @@ def main(args: argparse.Namespace):
for i, sample in enumerate(out_set):
print(f"--- Sample {i} ---")
messages = sample.messages
assert len(messages) == 2, f"Bad sample (messages) {len(messages)}"
assert messages[0]["role"] == "user", f"Bad sample (Q role): {messages}"
assert messages[1]["role"] == "assistant", f"Bad sample (A role): {messages}"
answer = messages[1]["content"].replace("\n", "\\n")
print(f"Q: {messages[0]['content']} [\"{sample.audio_transcript}\"]")
assert len(messages) >= 2, f"Bad sample (messages) {len(messages)}"
assert messages[-1]["role"] == "user", f"Bad sample (Q role): {messages}"
assert messages[-2]["role"] == "assistant", f"Bad sample (A role): {messages}"
answer = messages[-2]["content"].replace("\n", "\\n")
print(f"Q: {messages[-1]['content']} [\"{sample.audio_transcript}\"]")
print(f"A: {answer}")
if args.play:
audio = sample.audio
22 changes: 16 additions & 6 deletions ultravox/tools/infer_api.py
@@ -48,7 +48,7 @@ def infer_stream(
headers["Authorization"] = f"Bearer {self._api_key}"
data = {
"model": self._model,
"messages": [self._build_message(sample)],
"messages": self._build_messages(sample),
"stream": True,
}
if max_tokens is not None:
@@ -68,19 +68,27 @@ def infer_stream(
obj["usage"]["prompt_tokens"], obj["usage"]["completion_tokens"]
)

def _build_message(self, sample: datasets.VoiceSample):
def _build_messages(self, sample: datasets.VoiceSample):
"""
Convert a VoiceSample into a list of messages for the OpenAI API.
This function assumes that if the sample has an audio field, it is in
the last message, indicated by a "<|audio|>" placeholder.

Audio is converted to a data URI and inserted into the message under an image_url type.
"""
if sample.audio is None:
return {"role": "user", "content": sample.messages[0]["content"]}
return sample

fragments = sample.messages[0]["content"].split("<|audio|>")
fragments = sample.messages[-1]["content"].split("<|audio|>")
assert len(fragments) == 2, "Expected one <|audio|> placeholder"
url = datasets.audio_to_data_uri(sample.audio, sample.sample_rate)
parts = [
{"type": "text", "text": fragments[0]},
{"type": "image_url", "image_url": {"url": url}},
{"type": "text", "text": fragments[1]},
]
return {"role": "user", "content": parts}
last_turn = {"role": "user", "content": parts}
return sample.messages[:-1] + [last_turn]


class DatabricksInference(base.VoiceInference):
@@ -124,7 +132,9 @@ def infer(
# audio as a file, not as a base64-encoded string. There's probably
# a better way to do this, but I spent too much time on this already.
# api = self._client.view_api(print_info=False, return_format="dict")
text = sample.messages[0]["content"]
text = sample.messages[0][
"content"
] # TODO: change regarding multiple messages?
if self._url.startswith("https://demo.tincans.ai"):
args: List[Any] = [text]
if sample.audio is not None: