
Commit

Merge pull request SylphAI-Inc#371 from SylphAI-Inc/fix2
add gsm8k dataset
Sylph-AI authored Feb 13, 2025
2 parents 57f71d3 + 6d6c092 commit 98c9e02
Showing 11 changed files with 370 additions and 8 deletions.
1 change: 1 addition & 0 deletions adalflow/CHANGELOG.md
@@ -3,6 +3,7 @@

### Modified
- `Embedder` and `BatchEmbedder` changed to `DataComponent`.
- Add `GSM8K` dataset.

### model_client (added)
- `list_models` method.
5 changes: 3 additions & 2 deletions adalflow/adalflow/core/generator.py
@@ -550,9 +550,10 @@ def forward(
log.debug(f"Predecessors: {predecessors} for generator {self.name}")

def data_to_prompt_map_fn(data: Parameter) -> str:
"""GeneratorOutput will show the raw response instead of just the final data.
The backward engine and optimizer should look at all reasoning to decide the gradient.
"""
data: GeneratorOutput = data.data
# if data.data is not None:
# return data.data
if data.error is not None:
return f"Response: {data.raw_response} parsed with error: {data.error}"
return f" {data.raw_response}"
5 changes: 4 additions & 1 deletion adalflow/adalflow/datasets/__init__.py
@@ -1,7 +1,8 @@
from .big_bench_hard import BigBenchHard
from .hotpot_qa import HotPotQA
from .trec import TrecDataset
from .types import Example, HotPotQAData, TrecData
from .types import Example, HotPotQAData, TrecData, GSM8KData
from .gsm8k import GSM8K

__all__ = [
"BigBenchHard",
Expand All @@ -10,4 +11,6 @@
"HotPotQAData",
"TrecDataset",
"TrecData",
"GSM8KData",
"GSM8K",
]
170 changes: 170 additions & 0 deletions adalflow/adalflow/datasets/gsm8k.py
@@ -0,0 +1,170 @@
import random
import os
from typing import Literal
import tqdm

from adalflow.utils.lazy_import import safe_import, OptionalPackages


from adalflow.utils.data import Dataset
from adalflow.utils.file_io import save_json, load_json
from adalflow.datasets.utils import prepare_dataset_path
from adalflow.core.base_data_class import DataClass
from adalflow.datasets.types import GSM8KData
from adalflow.utils import printc


class GSM8K(Dataset):
    __doc__ = r"""Use huggingface datasets to load the GSM8K dataset.

    official_train: 7473
    official_test: 1319
    Our train split: 3736 (first half of the official train split)
    Our val split: 3737 (second half of the official train split)
    Our test split: 1319 (the official test split)

    You can use ``size`` to limit the number of examples to load.

    Example:

    .. code-block:: python

        dataset = GSM8K(split="train", size=10)
        print(f"example: {dataset[0]}")

    The output will be:

    .. code-block::

        GSM8KData(id='8fc791e6-ea1d-472c-a882-d00d0600d423',
        question="The result from the 40-item Statistics exam Marion and Ella took already came out.
        Ella got 4 incorrect answers while Marion got 6 more than half the score of Ella.
        What is Marion's score?",
        answer='24',
        gold_reasoning="Ella's score is 40 items - 4 items = <<40-4=36>>36 items.
        Half of Ella's score is 36 items / 2 = <<36/2=18>>18 items.
        So, Marion's score is 18 items + 6 items = <<18+6=24>>24 items.",
        reasoning=None)
    """

    def __init__(
        self,
        root: str = None,
        split: Literal["train", "val", "test"] = "train",
        size: int = None,
        **kwargs,
    ) -> None:

        if split not in ["train", "val", "test"]:
            raise ValueError("Split must be one of 'train', 'val', 'test'")

        self.root = root
        self.task_name = "gsm8k"
        data_path = prepare_dataset_path(self.root, self.task_name)
        # download and save
        split_json_path = os.path.join(data_path, f"{split}.json")
        print(f"split_json_path: {split_json_path}")
        self._check_or_download_dataset(split_json_path, split)

        # load from json
        self.data = []

        self.data = load_json(split_json_path)
        if size is not None:
            self.data = self.data[:size]
        # convert to dataclass
        self.data = [GSM8KData.from_dict(d) for d in self.data]

    def _check_or_download_dataset(
        self,
        data_path: str = None,
        split: str = "train",
    ):
        r"""Download the GSM8K data from huggingface datasets, split it, and save each split into a json file.

        Args:
            data_path (str): The path to save the data. In particular with split name appended.
            split (str): The dataset split, supports ``"train"`` (default), ``"val"`` and ``"test"``. Decides which split to return.
        """

        if data_path is None:
            raise ValueError("data_path must be specified")

        if os.path.exists(data_path):
            return

        safe_import(
            OptionalPackages.DATASETS.value[0], OptionalPackages.DATASETS.value[1]
        )
        from datasets import load_dataset

        # use huggingface cache
        gsm8k_dataset = load_dataset("gsm8k", "main", cache_dir=self.root)

        hf_official_train = gsm8k_dataset["train"]
        hf_official_test = gsm8k_dataset["test"]

        official_train = []
        official_test = []

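        # GSM8K stores each answer as step-by-step reasoning followed by "#### <number>".
        # Splitting on whitespace lets us take the last token as the numeric answer and
        # join everything before "####" back together as the gold reasoning.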
        for example in tqdm.tqdm(hf_official_train):
            question = example["question"]
            answer = example["answer"].strip().split()
            assert answer[-2] == "####"

            gold_reasoning = " ".join(answer[:-2])
            answer = str(int(answer[-1].replace(",", "")))
            official_train.append(
                dict(question=question, gold_reasoning=gold_reasoning, answer=answer)
            )

        for example in tqdm.tqdm(hf_official_test):
            question = example["question"]
            answer = example["answer"].strip().split()
            assert answer[-2] == "####"

            gold_reasoning = " ".join(answer[:-2])
            answer = str(int(answer[-1].replace(",", "")))
            official_test.append(
                dict(question=question, gold_reasoning=gold_reasoning, answer=answer)
            )

        rng = random.Random(0)
        rng.shuffle(official_train)  # 7473 train
        rng = random.Random(0)
        rng.shuffle(official_test)  # 1319 test

        printc(f"official_train: {len(official_train)}")
        printc(f"official_test: {len(official_test)}")
        train_set = official_train[: len(official_train) * 50 // 100]
        val_set = official_train[len(official_train) * 50 // 100 :]
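        # Deterministically shuffle (seed 0), then split the official train set 50/50 into
        # our train and val splits; the official test split is used as-is.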
        data_path_dir = os.path.dirname(data_path)
        for split_name, examples in zip(
            ["train", "val", "test"],
            [train_set, val_set, official_test],
        ):
            target_path = os.path.join(data_path_dir, f"{split_name}.json")
            save_json(examples, f=target_path)

        if split == "train":
            return train_set
        elif split == "val":
            return val_set
        else:
            return official_test

    def __getitem__(self, index) -> DataClass:
        return self.data[index]

    def __len__(self):
        return len(self.data)


if __name__ == "__main__":
    dataset = GSM8K(split="train", size=10)

    print(f"len: {len(dataset)}")
    print(f"dataset[0]: {dataset[0]}")
21 changes: 21 additions & 0 deletions adalflow/adalflow/datasets/types.py
@@ -26,6 +26,27 @@ class Example(DataClass):
    answer: str = field(metadata={"desc": "The answer to the question"}, default=None)


@dataclass
class GSM8KData(Example):
    __doc__ = """A dataclass for representing examples in the GSM8K dataset.

    You can reset the output fields:

    .. code-block:: python

        GSM8KData.set_output_fields(["answer"])
    """

    gold_reasoning: str = field(
        metadata={"desc": "The ground truth reasoning for the answer"}, default=None
    )
    reasoning: str = field(
        metadata={"desc": "The reasoning for the answer"}, default=None
    )  # your model's reasoning

    __input_fields__ = ["question"]
    __output_fields__ = ["reasoning", "answer"]  # default output fields


@dataclass
class HotPotQAData(Example):
    __doc__ = """A dataclass for representing examples in the HotPotQA dataset."""
6 changes: 4 additions & 2 deletions adalflow/adalflow/optim/parameter.py
@@ -180,7 +180,7 @@ def __init__(
        self,
        *,
        id: Optional[str] = None,  # unique id of the parameter
        data: T = None,  # for generator output, the data will be set up as raw_response
        data: T = None,
        data_id: str = None,  # for tracing the data item in the training/val/test set
        requires_opt: bool = True,
        role_desc: str = "",
@@ -1613,7 +1613,9 @@ def __init__(
        score: Optional[float] = None,
        eval_input: object = None,
        successor_map_fn: Optional[Dict[str, Callable]] = None,
        data_in_prompt: Optional[Callable] = None,
        data_in_prompt: Optional[
            Callable
        ] = None,  # how will the data be displayed in the prompt
        full_response: Optional[Any] = None,
    ):
        super().__init__(
4 changes: 2 additions & 2 deletions adalflow/adalflow/optim/trainer/trainer.py
@@ -985,7 +985,7 @@ def _fit_text_grads_one_step_for_debug(self, train_loader: Any) -> Dict[str, str
        correct_loss = None
        failed_loss = None
        all_losses = []
        printc("Finding one successful and one failed loss", "blue")
        printc("Finding one successful and one failed example", "blue")
        for batch in train_loader:
            y_preds = self.adaltask.train_step(batch, 0, self.num_workers)
            losses = self.adaltask.loss_step(batch, y_preds, 0, self.num_workers)
@@ -997,7 +997,7 @@
                else:
                    failed_loss = loss
                if correct_loss is not None and failed_loss is not None:
                    printc("Found correct and failed loss", "blue")
                    printc("Found correct and failed example", "blue")
                    break
        if not all_losses:
            raise ValueError("No losses found in the dataset.")
2 changes: 1 addition & 1 deletion docs/source/index.rst
@@ -466,7 +466,7 @@ Auto-optimize your LLM workflow with both Prompt Tuning and Few-shot Learning
    def load_datasets():
        train_data = TrecDataset(split="train")
        val_data = TrecDataset(split="val")ßß
        val_data = TrecDataset(split="val")
        test_data = TrecDataset(split="test")
        return train_data, val_data, test_data
1 change: 1 addition & 0 deletions docs/source/new_tutorials/embedder.rst
@@ -34,6 +34,7 @@ Unlike `Generator` which is trainable, `Embedder` is just a `DataComponent` that
By switching the ``ModelClient``, you can easily use different embedding models in your task pipeline, or even embed different data such as text, images, etc.
For end developers, you will most likely want to use :class:`ToEmbeddings<components.data_process.data_components.ToEmbeddings>` together with `Embedder`, as it (1) directly supports a sequence of `Document` objects, and (2) handles batch processing out of the box.
:class:`Document<core.types.Document>` is the container that AdalFlow also uses to process data in :class:`TextSplitter<components.data_process.text_splitter.TextSplitter>`, both of which are often required in a RAG pipeline.
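
A minimal sketch of how ``Embedder`` and ``ToEmbeddings`` fit together (illustrative only: the model name, batch size, and exact import paths are assumptions to check against your installed version):

.. code-block:: python

    import adalflow as adal
    from adalflow.core.types import Document
    from adalflow.components.data_process import ToEmbeddings

    embedder = adal.Embedder(
        model_client=adal.OpenAIClient(),
        model_kwargs={"model": "text-embedding-3-small"},
    )
    # ToEmbeddings batches the documents and writes each embedding back to doc.vector
    pipeline = ToEmbeddings(embedder=embedder, batch_size=50)

    docs = [Document(text="AdalFlow helps you build and auto-optimize LLM task pipelines.")]
    docs = pipeline(docs)
    print(len(docs[0].vector))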

.. EmbedderOutput
.. --------------
73 changes: 73 additions & 0 deletions use_cases/question_answering/gsm8k/task.py
@@ -0,0 +1,73 @@
from typing import Dict, Union
import re
import adalflow as adal

template = r"""<START_OF_SYSTEM_PROMPT>
{{system_prompt}}
<END_OF_SYSTEM_PROMPT>
<START_OF_USER_PROMPT>
{{input_str}}
<END_OF_USER_PROMPT>
"""

system_prompt_start = "You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value."
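# The system prompt asks for a final line of the form "Answer: $VALUE"; the parser
# below relies on that by taking the last integer it finds in the model's output.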


@adal.func_to_data_component
def parse_integer_answer(answer: str) -> str:
    try:
        numbers = re.findall(r"\d+", answer)
        if numbers:
            answer = numbers[-1]
        else:
            answer = ""
    except ValueError:
        answer = ""

    return answer
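

# e.g. parse_integer_answer("... Answer: 42") -> "42"; a completion with no digits
# parses to the empty string "".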


class GSM8KTask(adal.Component):
    def __init__(self, model_client: adal.ModelClient, model_kwargs: Dict):
        super().__init__()

        system_prompt = adal.Parameter(
            data=system_prompt_start,
            role_desc="To give task instruction to the language model in the system prompt",
            requires_opt=True,
            param_type=adal.ParameterType.PROMPT,
        )
        self.generator = adal.Generator(
            model_client=model_client,
            model_kwargs=model_kwargs,
            prompt_kwargs={
                "system_prompt": system_prompt,
            },
            template=template,
            output_processors=parse_integer_answer,
            use_cache=True,
        )
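
    # bicall serves both modes: during evaluation the generator returns a GeneratorOutput,
    # while during training it returns a trainable Parameter, so the same method works
    # for inference and for prompt optimization.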

    def bicall(
        self, question: str, id: str = None
    ) -> Union[adal.GeneratorOutput, adal.Parameter]:
        output = self.generator(prompt_kwargs={"input_str": question}, id=id)
        return output


if __name__ == "__main__":
    from adalflow.utils import setup_env
    from adalflow.datasets.gsm8k import GSM8K

    setup_env()

    from use_cases.config import gpt_3_model

    task = GSM8KTask(**gpt_3_model)

    train_dataset = GSM8K(split="train", size=10)

    print("example: ", train_dataset[0])

    output = task(question=train_dataset[0].question)
    print("output: ", output)