
Commit

Merge pull request SylphAI-Inc#371 from SylphAI-Inc/fix2
add gsm8k dataset
Sylph-AI authored Feb 13, 2025
2 parents 57f71d3 + 6d6c092 commit 98c9e02
Showing 11 changed files with 370 additions and 8 deletions.
1 change: 1 addition & 0 deletions adalflow/CHANGELOG.md
@@ -3,6 +3,7 @@

### Modified
- `Embedder` and `BatchEmbedder` changed to `DataComponent`.
- Add `GSM8K` dataset.

### model_client (added)
- `list_models` method.
5 changes: 3 additions & 2 deletions adalflow/adalflow/core/generator.py
@@ -550,9 +550,10 @@ def forward(
log.debug(f"Predecessors: {predecessors} for generator {self.name}")

def data_to_prompt_map_fn(data: Parameter) -> str:
"""GeneratorOutput will show the raw response instead of just the final data.
The backward engine and optimizer should look at all reasoning to decide the gradient.
"""
data: GeneratorOutput = data.data
# if data.data is not None:
# return data.data
if data.error is not None:
return f"Response: {data.raw_response} parsed with error: {data.error}"
return f" {data.raw_response}"
5 changes: 4 additions & 1 deletion adalflow/adalflow/datasets/__init__.py
@@ -1,7 +1,8 @@
from .big_bench_hard import BigBenchHard
from .hotpot_qa import HotPotQA
from .trec import TrecDataset
from .types import Example, HotPotQAData, TrecData
from .types import Example, HotPotQAData, TrecData, GSM8KData
from .gsm8k import GSM8K

__all__ = [
"BigBenchHard",
Expand All @@ -10,4 +11,6 @@
"HotPotQAData",
"TrecDataset",
"TrecData",
"GSM8KData",
"GSM8K",
]
170 changes: 170 additions & 0 deletions adalflow/adalflow/datasets/gsm8k.py
@@ -0,0 +1,170 @@
import random
import os
from typing import Literal
import tqdm

from adalflow.utils.lazy_import import safe_import, OptionalPackages


from adalflow.utils.data import Dataset
from adalflow.utils.file_io import save_json, load_json
from adalflow.datasets.utils import prepare_dataset_path
from adalflow.core.base_data_class import DataClass
from adalflow.datasets.types import GSM8KData
from adalflow.utils import printc


class GSM8K(Dataset):
    __doc__ = r"""Use huggingface datasets to load the GSM8K dataset.

    official_train: 7473
    official_test: 1319
    Our train split: 3736 (first half of the official train split)
    Our val split: 3737 (second half of the official train split)
    Our test split: 1319 (the official test split)

    You can use ``size`` to limit the number of examples to load.

    Example:

    .. code-block:: python

        dataset = GSM8K(split="train", size=10)
        print(f"example: {dataset[0]}")

    The output will be:

    .. code-block::

        GSM8KData(id='8fc791e6-ea1d-472c-a882-d00d0600d423',
        question="The result from the 40-item Statistics exam Marion and Ella took already came out.
        Ella got 4 incorrect answers while Marion got 6 more than half the score of Ella.
        What is Marion's score?",
        answer='24',
        gold_reasoning="Ella's score is 40 items - 4 items = <<40-4=36>>36 items.
        Half of Ella's score is 36 items / 2 = <<36/2=18>>18 items.
        So, Marion's score is 18 items + 6 items = <<18+6=24>>24 items.",
        reasoning=None)
    """

    def __init__(
        self,
        root: str = None,
        split: Literal["train", "val", "test"] = "train",
        size: int = None,
        **kwargs,
    ) -> None:

        if split not in ["train", "val", "test"]:
            raise ValueError("Split must be one of 'train', 'val', 'test'")

        self.root = root
        self.task_name = "gsm8k"
        data_path = prepare_dataset_path(self.root, self.task_name)
        # download and save
        split_json_path = os.path.join(data_path, f"{split}.json")
        print(f"split_json_path: {split_json_path}")
        self._check_or_download_dataset(split_json_path, split)

        # load from json
        self.data = []

        self.data = load_json(split_json_path)
        if size is not None:
            self.data = self.data[:size]
        # convert to dataclass
        self.data = [GSM8KData.from_dict(d) for d in self.data]

    def _check_or_download_dataset(
        self,
        data_path: str = None,
        split: str = "train",
    ):
        r"""Download the GSM8K data from huggingface datasets, split it, and save each split into a json file.

        Args:
            data_path (str): The path to save the data. In particular with split name appended.
            split (str): The dataset split, supports ``"train"`` (default), ``"val"`` and ``"test"``. Decides which split to return.
        """

        if data_path is None:
            raise ValueError("data_path must be specified")

        if os.path.exists(data_path):
            return

        safe_import(
            OptionalPackages.DATASETS.value[0], OptionalPackages.DATASETS.value[1]
        )
        from datasets import load_dataset

        # use huggingface cache
        gsm8k_dataset = load_dataset("gsm8k", "main", cache_dir=self.root)

        hf_official_train = gsm8k_dataset["train"]
        hf_official_test = gsm8k_dataset["test"]

        official_train = []
        official_test = []

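        # GSM8K stores each answer as step-by-step reasoning followed by "#### <number>".
        # Splitting on whitespace lets us take the last token as the numeric answer and
        # join everything before "####" back together as the gold reasoning.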
        for example in tqdm.tqdm(hf_official_train):
            question = example["question"]
            answer = example["answer"].strip().split()
            assert answer[-2] == "####"

            gold_reasoning = " ".join(answer[:-2])
            answer = str(int(answer[-1].replace(",", "")))
            official_train.append(
                dict(question=question, gold_reasoning=gold_reasoning, answer=answer)
            )

        for example in tqdm.tqdm(hf_official_test):
            question = example["question"]
            answer = example["answer"].strip().split()
            assert answer[-2] == "####"

            gold_reasoning = " ".join(answer[:-2])
            answer = str(int(answer[-1].replace(",", "")))
            official_test.append(
                dict(question=question, gold_reasoning=gold_reasoning, answer=answer)
            )

        rng = random.Random(0)
        rng.shuffle(official_train)  # 7473 train
        rng = random.Random(0)
        rng.shuffle(official_test)  # 1319 test

        printc(f"official_train: {len(official_train)}")
        printc(f"official_test: {len(official_test)}")
        train_set = official_train[: len(official_train) * 50 // 100]
        val_set = official_train[len(official_train) * 50 // 100 :]
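        # Deterministically shuffle (seed 0), then split the official train set 50/50 into
        # our train and val splits; the official test split is used as-is.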
        data_path_dir = os.path.dirname(data_path)
        for split_name, examples in zip(
            ["train", "val", "test"],
            [train_set, val_set, official_test],
        ):
            target_path = os.path.join(data_path_dir, f"{split_name}.json")
            save_json(examples, f=target_path)

        if split == "train":
            return train_set
        elif split == "val":
            return val_set
        else:
            return official_test

    def __getitem__(self, index) -> DataClass:
        return self.data[index]

    def __len__(self):
        return len(self.data)


if __name__ == "__main__":
    dataset = GSM8K(split="train", size=10)

    print(f"len: {len(dataset)}")
    print(f"dataset[0]: {dataset[0]}")
21 changes: 21 additions & 0 deletions adalflow/adalflow/datasets/types.py
@@ -26,6 +26,27 @@ class Example(DataClass):
    answer: str = field(metadata={"desc": "The answer to the question"}, default=None)


@dataclass
class GSM8KData(Example):
    __doc__ = """A dataclass for representing examples in the GSM8K dataset.

    You can reset the output fields:

    .. code-block:: python

        GSM8KData.set_output_fields(["answer"])
    """

    gold_reasoning: str = field(
        metadata={"desc": "The ground truth reasoning for the answer"}, default=None
    )
    reasoning: str = field(
        metadata={"desc": "The reasoning for the answer"}, default=None
    )  # your model's reasoning

    __input_fields__ = ["question"]
    __output_fields__ = ["reasoning", "answer"]  # default output fields


@dataclass
class HotPotQAData(Example):
    __doc__ = """A dataclass for representing examples in the HotPotQA dataset."""
6 changes: 4 additions & 2 deletions adalflow/adalflow/optim/parameter.py
@@ -180,7 +180,7 @@ def __init__(
        self,
        *,
        id: Optional[str] = None,  # unique id of the parameter
        data: T = None,  # for generator output, the data will be set up as raw_response
        data: T = None,
        data_id: str = None,  # for tracing the data item in the training/val/test set
        requires_opt: bool = True,
        role_desc: str = "",
@@ -1613,7 +1613,9 @@ def __init__(
        score: Optional[float] = None,
        eval_input: object = None,
        successor_map_fn: Optional[Dict[str, Callable]] = None,
        data_in_prompt: Optional[Callable] = None,
        data_in_prompt: Optional[
            Callable
        ] = None,  # how will the data be displayed in the prompt
        full_response: Optional[Any] = None,
    ):
        super().__init__(
4 changes: 2 additions & 2 deletions adalflow/adalflow/optim/trainer/trainer.py
@@ -985,7 +985,7 @@ def _fit_text_grads_one_step_for_debug(self, train_loader: Any) -> Dict[str, str
        correct_loss = None
        failed_loss = None
        all_losses = []
        printc("Finding one successful and one failed loss", "blue")
        printc("Finding one successful and one failed example", "blue")
        for batch in train_loader:
            y_preds = self.adaltask.train_step(batch, 0, self.num_workers)
            losses = self.adaltask.loss_step(batch, y_preds, 0, self.num_workers)
@@ -997,7 +997,7 @@
                else:
                    failed_loss = loss
                if correct_loss is not None and failed_loss is not None:
                    printc("Found correct and failed loss", "blue")
                    printc("Found correct and failed example", "blue")
                    break
        if not all_losses:
            raise ValueError("No losses found in the dataset.")
2 changes: 1 addition & 1 deletion docs/source/index.rst
@@ -466,7 +466,7 @@ Auto-optimize your LLM workflow with both Prompt Tuning and Few-shot Learning
    def load_datasets():
        train_data = TrecDataset(split="train")
        val_data = TrecDataset(split="val")ßß
        val_data = TrecDataset(split="val")
        test_data = TrecDataset(split="test")
        return train_data, val_data, test_data
1 change: 1 addition & 0 deletions docs/source/new_tutorials/embedder.rst
@@ -34,6 +34,7 @@ Unlike `Generator` which is trainable, `Embedder` is just a `DataComponent` that
By switching the ``ModelClient``, you can easily use different embedding models in your task pipeline, or even embed different data such as text, images, etc.
For end developers, you will most likely want to use :class:`ToEmbeddings<components.data_process.data_components.ToEmbeddings>` together with `Embedder`, as it (1) directly supports a sequence of `Document` objects, and (2) handles batch processing out of the box.
:class:`Document<core.types.Document>` is the container that AdalFlow also uses to process data in :class:`TextSplitter<components.data_process.text_splitter.TextSplitter>`, both of which are often required in a RAG pipeline.
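
A minimal sketch of how ``Embedder`` and ``ToEmbeddings`` fit together (illustrative only: the model name, batch size, and exact import paths are assumptions to check against your installed version):

.. code-block:: python

    import adalflow as adal
    from adalflow.core.types import Document
    from adalflow.components.data_process import ToEmbeddings

    embedder = adal.Embedder(
        model_client=adal.OpenAIClient(),
        model_kwargs={"model": "text-embedding-3-small"},
    )
    # ToEmbeddings batches the documents and writes each embedding back to doc.vector
    pipeline = ToEmbeddings(embedder=embedder, batch_size=50)

    docs = [Document(text="AdalFlow helps you build and auto-optimize LLM task pipelines.")]
    docs = pipeline(docs)
    print(len(docs[0].vector))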

.. EmbedderOutput
.. --------------
73 changes: 73 additions & 0 deletions use_cases/question_answering/gsm8k/task.py
@@ -0,0 +1,73 @@
from typing import Dict, Union
import re
import adalflow as adal

template = r"""<START_OF_SYSTEM_PROMPT>
{{system_prompt}}
<END_OF_SYSTEM_PROMPT>
<START_OF_USER_PROMPT>
{{input_str}}
<END_OF_USER_PROMPT>
"""

system_prompt_start = "You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value."
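# The system prompt asks for a final line of the form "Answer: $VALUE"; the parser
# below relies on that by taking the last integer it finds in the model's output.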


@adal.func_to_data_component
def parse_integer_answer(answer: str) -> str:
    try:
        numbers = re.findall(r"\d+", answer)
        if numbers:
            answer = numbers[-1]
        else:
            answer = ""
    except ValueError:
        answer = ""

    return answer
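

# e.g. parse_integer_answer("... Answer: 42") -> "42"; a completion with no digits
# parses to the empty string "".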


class GSM8KTask(adal.Component):
    def __init__(self, model_client: adal.ModelClient, model_kwargs: Dict):
        super().__init__()

        system_prompt = adal.Parameter(
            data=system_prompt_start,
            role_desc="To give task instruction to the language model in the system prompt",
            requires_opt=True,
            param_type=adal.ParameterType.PROMPT,
        )
        self.generator = adal.Generator(
            model_client=model_client,
            model_kwargs=model_kwargs,
            prompt_kwargs={
                "system_prompt": system_prompt,
            },
            template=template,
            output_processors=parse_integer_answer,
            use_cache=True,
        )
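
    # bicall serves both modes: during evaluation the generator returns a GeneratorOutput,
    # while during training it returns a trainable Parameter, so the same method works
    # for inference and for prompt optimization.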

    def bicall(
        self, question: str, id: str = None
    ) -> Union[adal.GeneratorOutput, adal.Parameter]:
        output = self.generator(prompt_kwargs={"input_str": question}, id=id)
        return output


if __name__ == "__main__":
    from adalflow.utils import setup_env
    from adalflow.datasets.gsm8k import GSM8K

    setup_env()

    from use_cases.config import gpt_3_model

    task = GSM8KTask(**gpt_3_model)

    train_dataset = GSM8K(split="train", size=10)

    print("example: ", train_dataset[0])

    output = task(question=train_dataset[0].question)
    print("output: ", output)