diff --git a/adalflow/adalflow/optim/trainer/trainer.py b/adalflow/adalflow/optim/trainer/trainer.py index 298af441..7fbc8ae1 100644 --- a/adalflow/adalflow/optim/trainer/trainer.py +++ b/adalflow/adalflow/optim/trainer/trainer.py @@ -1582,7 +1582,7 @@ def _fit_demos_random( trainer_results, last_val_score, test_score=trainer_results.test_scores[-1], - prompts=trainer_results.prompts[-1], + prompts=trainer_results.step_results[-1].prompt, step=step, attempted_val_score=val_score, ) diff --git a/benchmarks/BHH_object_count/text_grad/text_grad_train.py b/benchmarks/BHH_object_count/text_grad/text_grad_train.py index 7acf8c00..725e1ce7 100644 --- a/benchmarks/BHH_object_count/text_grad/text_grad_train.py +++ b/benchmarks/BHH_object_count/text_grad/text_grad_train.py @@ -1,3 +1,11 @@ +""" +Text grad's object count implementation: + +self._task_description = "You will answer a reasoning question. Think step by step. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value." + +We use the same task description, the only difference is dspy send over a messages: [system_prompt, user_message] and we do ["system": <> system_prompt <> <> user_message<>] +""" + import logging diff --git a/benchmarks/BHH_object_count/text_grad/trec_6_train.py b/benchmarks/BHH_object_count/text_grad/trec_6_train.py index d5f47198..e546fb48 100644 --- a/benchmarks/BHH_object_count/text_grad/trec_6_train.py +++ b/benchmarks/BHH_object_count/text_grad/trec_6_train.py @@ -365,3 +365,9 @@ def multi_run_train(num_runs=4): # eval_fn=eval_fn, # ) multi_run_train(num_runs=4) + +# Average Training Time: 1383.4382199645042 +# Average Test Score: 0.8488023952095808 +# Average Val Score: 0.8373493975903614 +# Std Test Score: 0.011691990532794386 +# Std Val Score: 0.01127005236980105 diff --git a/benchmarks/hotpot_qa/adal_exp/build_multi_hop_rag.py b/benchmarks/hotpot_qa/adal_exp/build_multi_hop_rag.py index 3d50e25d..af2e2c9d 100644 --- a/benchmarks/hotpot_qa/adal_exp/build_multi_hop_rag.py +++ b/benchmarks/hotpot_qa/adal_exp/build_multi_hop_rag.py @@ -76,21 +76,21 @@ class QueriesOutput(adal.DataClass): ) -class DeduplicateList(adal.GradComponent): - def __init__(self): - super().__init__() +# class DeduplicateList(adal.GradComponent): +# def __init__(self): +# super().__init__() - def call( - self, exisiting_list: List[str], new_list: List[str], id: str = None - ) -> List[str]: +# def call( +# self, exisiting_list: List[str], new_list: List[str], id: str = None +# ) -> List[str]: - seen = set() - return [x for x in exisiting_list + new_list if not (x in seen or seen.add(x))] +# seen = set() +# return [x for x in exisiting_list + new_list if not (x in seen or seen.add(x))] - def backward(self, *args, **kwargs): +# def backward(self, *args, **kwargs): - printc(f"DeduplicateList backward: {args}", "yellow") - return super().backward(*args, **kwargs) +# printc(f"DeduplicateList backward: {args}", "yellow") +# return super().backward(*args, **kwargs) class CombineList(GradComponent2): @@ -215,7 +215,6 @@ def __init__(self, model_client, model_kwargs, passages_per_hop=3, max_hops=2): ) self.retriever = DspyRetriever(top_k=passages_per_hop) - self.deduplicater = DeduplicateList() self.combine_list = CombineList() @staticmethod @@ -379,14 +378,14 @@ def __init__(self, model_client, model_kwargs, passages_per_hop=3, max_hops=2): model_client=model_client, model_kwargs=model_kwargs, prompt_kwargs={ - # "few_shot_demos": Parameter( - # name=f"few_shot_demos_{i}", - # # 
data=few_shot_demos[i], - # data=None, - # role_desc="To provide few shot demos to the language model", - # requires_opt=True, - # param_type=ParameterType.DEMOS, - # ), + "few_shot_demos": Parameter( + name=f"few_shot_demos_{i}", + # data=few_shot_demos[i], + data=None, + role_desc="To provide few shot demos to the language model", + requires_opt=True, + param_type=ParameterType.DEMOS, + ), "task_desc_str": Parameter( name="task_desc_str", data=task_desc_str, @@ -411,7 +410,6 @@ def __init__(self, model_client, model_kwargs, passages_per_hop=3, max_hops=2): ) ) self.retrievers.append(DspyRetriever(top_k=passages_per_hop)) - self.deduplicaters.append(DeduplicateList()) self.combine_list = CombineList() self.combine_queries = CombineQueries() @@ -456,36 +454,7 @@ def call(self, *, input: str, id: str = None) -> adal.RetrieverOutput: ) return out - def call2(self, *, input: str, id: str = None) -> str: - context = [] - queries: List[str] = [] - last_query = None - for i in range(self.max_hops): - gen_out = self.query_generators[i]( - prompt_kwargs={ - "context": context, - "question": input, - "last_query": last_query, - }, - id=id, - ) - - query = gen_out.data.query if gen_out.data and gen_out.data.query else input - - retrieve_out = self.retrievers[i](input=query, id=id) - - passages = retrieve_out.documents - context = self.deduplicate(context + passages) - queries.append(query) - last_query = query - out = ", ".join(queries) - query_output = QueriesOutput(data=out, id=id) - return query_output - def forward(self, *, input: str, id: str = None) -> adal.Parameter: - # assemble the foundamental building blocks - # printc(f"question: {input}", "yellow") - # context = [] queries: List[str] = [] @@ -530,15 +499,6 @@ def forward(self, *, input: str, id: str = None) -> adal.Parameter: def retrieve_out_map_fn(x: adal.Parameter): return x.data.documents if x.data and x.data.documents else [] - # print(f"retrieve_out: {retrieve_out}") - - # retrieve_out.add_successor_map_fn( - # successor=self.deduplicaters[i], map_fn=retrieve_out_map_fn - # ) - - # context = self.deduplicaters[i].forward( - # exisiting_list=context, new_list=retrieve_out - # ) retrieve_out.data_in_prompt = lambda x: { "query": x.data.query, "documents": x.data.documents, @@ -550,13 +510,6 @@ def retrieve_out_map_fn(x: adal.Parameter): ) last_query = success_map_fn(gen_out) contexts.append(retrieve_out) - # if i + 1 < self.max_hops: - # retrieve_out.add_successor_map_fn( - # successor=self.query_generators[i + 1], map_fn=retrieve_out_map_fn - # ) - - # last_query = success_map_fn(gen_out) - # printc(f"retrieve_out, last_query: {last_query}") contexts[0].add_successor_map_fn( successor=self.combine_list, map_fn=lambda x: x.data @@ -645,22 +598,7 @@ def retrieve_out_map_fn(x: adal.Parameter): ) last_query = success_map_fn(gen_out) - # printc(f"retrieve_out, last_query: {last_query}") - # contexts[0].add_successor_map_fn( - # successor=self.combine_list, map_fn=lambda x: x.data - # ) - # contexts[1].add_successor_map_fn( - # successor=self.combine_list, map_fn=lambda x: x.data - # ) - # contexts_sum = self.combine_list.forward( - # context_1=contexts[0], context_2=contexts[1] - # ) - # contexts_sum.data_in_prompt = lambda x: { - # "query": x.data.query, - # "documents": x.data.documents, - # } - # setattr(contexts_sum, "queries", [q.data.data.query for q in queries]) queries[0].add_successor_map_fn( successor=self.combine_queries, map_fn=lambda x: x.data.data.query ) diff --git a/benchmarks/hotpot_qa/adal_exp/build_vanilla_rag.py 
b/benchmarks/hotpot_qa/adal_exp/build_vanilla_rag.py index 9f0668a6..89ed60fb 100644 --- a/benchmarks/hotpot_qa/adal_exp/build_vanilla_rag.py +++ b/benchmarks/hotpot_qa/adal_exp/build_vanilla_rag.py @@ -169,13 +169,13 @@ def __init__(self, passages_per_hop=3, model_client=None, model_kwargs=None): instruction_to_optimizer="ou need find the best way(where does the right answer come from the context) to extract the RIGHT answer from the context.", # + "Given existing context, ensure the task instructions can maximize the performance.", ), - # "few_shot_demos": adal.Parameter( - # # data=demo_str, - # data=None, - # requires_opt=True, - # role_desc="To provide few shot demos to the language model", - # param_type=adal.ParameterType.DEMOS, - # ), + "few_shot_demos": adal.Parameter( + # data=demo_str, + data=None, + requires_opt=True, + role_desc="To provide few shot demos to the language model", + param_type=adal.ParameterType.DEMOS, + ), "output_format_str": self.llm_parser.get_output_format_str(), # "output_format_str": adal.Parameter( # data=self.llm_parser.get_output_format_str(), diff --git a/benchmarks/hotpot_qa/adal_exp/train_multi_hop_rag.py b/benchmarks/hotpot_qa/adal_exp/train_multi_hop_rag.py index 0f890263..666ad295 100644 --- a/benchmarks/hotpot_qa/adal_exp/train_multi_hop_rag.py +++ b/benchmarks/hotpot_qa/adal_exp/train_multi_hop_rag.py @@ -206,8 +206,8 @@ def train( # train_diagnose(**gpt_3_model) ckpt = train( - debug=True, - max_steps=24, + debug=False, + max_steps=12, seed=2025, # pass the numpy seed tg=use_tg, strategy=set_strategy, diff --git a/benchmarks/hotpot_qa/dspy_multi_hop_rag.py b/benchmarks/hotpot_qa/dspy_multi_hop_rag.py index 5e7cf0db..bd2c394a 100644 --- a/benchmarks/hotpot_qa/dspy_multi_hop_rag.py +++ b/benchmarks/hotpot_qa/dspy_multi_hop_rag.py @@ -45,6 +45,12 @@ class GenerateAnswer(dspy.Signature): answer = dspy.OutputField(desc="answer to the question") +task_desc_str = """ +You will receive an original question, last search query, and the retrieved context from the last search query. +Write the next search query to help retrieve all relevant context to answer the original question. +Think step by step.""" + + class GenerateSearchQuery(dspy.Signature): """Write a simple search query that will help answer a complex question.""" @@ -53,6 +59,17 @@ class GenerateSearchQuery(dspy.Signature): query = dspy.OutputField() +# class GenerateSearchQuery(dspy.Signature): +# """You will receive an original question, last search query, and the retrieved context from the last search query. +# Write the next search query to help retrieve all relevant context to answer the original question. 
+# """ + +# context = dspy.InputField(desc="may contain relevant facts") +# question = dspy.InputField() +# last_search_query = dspy.InputField(desc="The last search query") +# query = dspy.OutputField() + + from dsp.utils import deduplicate @@ -69,9 +86,14 @@ def __init__(self, passages_per_hop=2, max_hops=2): def forward(self, question): context = [] + # last_query = None for hop in range(self.max_hops): - query = self.generate_query[hop](context=context, question=question).query + query = self.generate_query[hop]( + context=context, + question=question, # last_search_query=last_query + ).query + # last_query = query passages = self.retrieve(query).passages context = deduplicate(context + passages) @@ -100,16 +122,17 @@ def train_MIPROv2(trainset, valset, save_path, filename): metric=validate_answer, prompt_model=gpt_4, task_model=turbo, - num_candidates=30, + num_candidates=12, init_temperature=1.0, + log_dir=save_path, ) compiled_task = tp.compile( DsPyMultiHopRAG(), trainset=trainset, valset=valset, - max_bootstrapped_demos=5, - max_labeled_demos=2, - num_batches=12, # MINIBATCH_SIZE = 25, + max_bootstrapped_demos=0, # 2, + max_labeled_demos=0, # 2, + num_batches=12, # MINIBATCH_SIZE = 25, (eval on trainset) seed=2025, requires_permission_to_run=False, ) @@ -136,7 +159,7 @@ def train_MIPROv2(trainset, valset, save_path, filename): # return compiled_baleen -def validate(devset, compiled_baleen, uncompiled_baleen): +def validate(devset, compiled_baleen=None, uncompiled_baleen=None): from dspy.evaluate.evaluate import Evaluate # Define metric to check if we retrieved the correct documents @@ -155,10 +178,13 @@ def validate(devset, compiled_baleen, uncompiled_baleen): display_table=5, # metric=validate_answer, ) - uncompiled_baleen_answer_score = evaluate_on_hotpotqa( - uncompiled_baleen, metric=validate_answer, display_progress=True - ) - print(f"## Answer Score for uncompiled Baleen: {uncompiled_baleen_answer_score}") + if uncompiled_baleen is not None: + uncompiled_baleen_answer_score = evaluate_on_hotpotqa( + uncompiled_baleen, metric=validate_answer, display_progress=True + ) + print( + f"## Answer Score for uncompiled Baleen: {uncompiled_baleen_answer_score}" + ) if compiled_baleen is None: return @@ -167,12 +193,13 @@ def validate(devset, compiled_baleen, uncompiled_baleen): compiled_baleen, metric=validate_answer, display_progress=True ) print(f"## Answer Score for compiled Baleen: {compiled_baleen_answer_score}") + return compiled_baleen_answer_score def train_and_validate(): import os - save_path = "benchmarks/hotpot_qa/dspy_multi_hop_rag" + save_path = "benchmarks/hotpot_qa/dspy_multi_hop_rag_zero_shot" if not os.path.exists(save_path): os.makedirs(save_path) @@ -184,6 +211,8 @@ def train_and_validate(): val_accs = [] test_accs = [] training_times = [] + max_val_score = 0 + max_test_score = 0 num_runs = 4 @@ -194,8 +223,8 @@ def train_and_validate(): compiled_count = train_MIPROv2( dspy_trainset, dspy_valset, save_path, output_file ) - val_acc = validate(dspy_valset, compiled_count) - test_acc = validate(dspy_testset, compiled_count) + val_acc = validate(dspy_valset, compiled_count) # 46 + test_acc = validate(dspy_testset, compiled_count) # 52 val_accs.append(val_acc) test_accs.append(test_acc) @@ -208,11 +237,14 @@ def train_and_validate(): val_accs = np.array(val_accs) test_accs = np.array(test_accs) training_times = np.array(training_times) - + max_val_score = val_accs.max() + max_test_score = test_accs.max() print("Validation accuracy:", val_accs.mean(), val_accs.std()) 
print("Test accuracy:", test_accs.mean(), test_accs.std()) print("Training time:", training_times.mean()) + print("Max val score: ", max_val_score) + print("Max test score: ", max_test_score) if __name__ == "__main__": @@ -246,6 +278,14 @@ def train_and_validate(): # compiled_baleen = train(trainset, dspy_save_path, "hotpotqa.json") # validate(devset, compiled_baleen, uncompiled_baleen) - # dspy 16 raw shots, 4 demos - # dspy supports multiple generators, in this case 3. Two query generator and one answer generator, they all choose the same examples. - # accuracy 62.0 + # with demos (2, 2) + # Validation accuracy: 47.25 3.031088913245535 + # Test accuracy: 50.625 3.0898017735770686 + # Training time: 2465.3250265717506 + + # zero shot + # Validation accuracy: 35.5 4.330127018922194 + # Test accuracy: 37.875 5.140221298738022 + # Training time: 182.31551551818848 + # Max val score: 42.0 + # Max test score: 46.5 diff --git a/benchmarks/hotpot_qa/dspy_multi_hop_rag_cycle.py b/benchmarks/hotpot_qa/dspy_multi_hop_rag_cycle.py new file mode 100644 index 00000000..c23e3945 --- /dev/null +++ b/benchmarks/hotpot_qa/dspy_multi_hop_rag_cycle.py @@ -0,0 +1,292 @@ +""" +Dspy's CoT is handled by itself with ChainOfThought. That is why our prompt (signature) does not need to include "think step by step". +""" + +import dspy +import dspy.evaluate + +from dspy import Example + + +turbo = dspy.OpenAI(model="gpt-3.5-turbo-0125") +gpt_4 = dspy.OpenAI(model="gpt-4o") + +colbertv2_wiki17_abstracts = dspy.ColBERTv2( + url="http://20.102.90.50:2017/wiki17_abstracts" +) + +dspy.settings.configure(lm=turbo, rm=colbertv2_wiki17_abstracts) +from adalflow.eval.answer_match_acc import AnswerMatchAcc + + +def load_datasets(): + from benchmarks.hotpot_qa.config import load_datasets + + trainset, valset, testset = load_datasets() + # dspy requires us to package the dataset to Example objects and specify the inputs + + dspy_trainset, dspy_valset, dspy_testset = [], [], [] + for dataset in zip( + [trainset, valset, testset], [dspy_trainset, dspy_valset, dspy_testset] + ): + for item in dataset[0]: + example = Example(question=item.question, answer=item.answer) + example = example.with_inputs("question") + dataset[1].append(example) + + return dspy_trainset, dspy_valset, dspy_testset + + +class GenerateAnswer(dspy.Signature): + """Answer questions with short factoid answers.""" + + context = dspy.InputField(desc="may contain relevant facts") + question = dspy.InputField() + answer = dspy.OutputField(desc="answer to the question") + + +task_desc_str = """ +You will receive an original question, last search query, and the retrieved context from the last search query. +Write the next search query to help retrieve all relevant context to answer the original question. +Think step by step.""" + + +class GenerateSearchQuery(dspy.Signature): + """Write a simple search query that will help answer a complex question.""" + + context = dspy.InputField(desc="may contain relevant facts") + question = dspy.InputField() + query = dspy.OutputField() + + +# class GenerateSearchQuery(dspy.Signature): +# """You will receive an original question, last search query, and the retrieved context from the last search query. +# Write the next search query to help retrieve all relevant context to answer the original question. 
+# """ + +# context = dspy.InputField(desc="may contain relevant facts") +# question = dspy.InputField() +# last_search_query = dspy.InputField(desc="The last search query") +# query = dspy.OutputField() + + +from dsp.utils import deduplicate + + +class DsPyMultiHopRAGCycle(dspy.Module): + def __init__(self, passages_per_hop=2, max_hops=2): + super().__init__() + + self.generate_query = dspy.ChainOfThought(GenerateSearchQuery) + + self.retrieve = dspy.Retrieve(k=passages_per_hop) + self.generate_answer = dspy.ChainOfThought(GenerateAnswer) + self.max_hops = max_hops + + def forward(self, question): + context = [] + # last_query = None + + for hop in range(self.max_hops): + query = self.generate_query( + context=context, + question=question, # last_search_query=last_query + ).query + # last_query = query + passages = self.retrieve(query).passages + context = deduplicate(context + passages) + + pred = self.generate_answer(context=context, question=question) + return dspy.Prediction(context=context, answer=pred.answer) + + +# pred: Prediction + + +def validate_answer(example, pred, trace=None): + evaluator = AnswerMatchAcc(type="exact_match") + print(f"pred: {pred.answer}, example: {example['answer']}") + return evaluator.compute_single_item(str(pred.answer), example["answer"]) + + +def train_MIPROv2(trainset, valset, save_path, filename): + + import os + from dspy.teleprompt import MIPROv2 + + if not os.path.exists(save_path): + os.makedirs(save_path) + + tp = MIPROv2( + metric=validate_answer, + prompt_model=gpt_4, + task_model=turbo, + num_candidates=12, + init_temperature=1.0, + log_dir=save_path, + ) + compiled_task = tp.compile( + DsPyMultiHopRAGCycle(), + trainset=trainset, + valset=valset, + max_bootstrapped_demos=0, # 2, + max_labeled_demos=0, # 2, + num_batches=12, # MINIBATCH_SIZE = 25, (eval on trainset) + seed=2025, + requires_permission_to_run=False, + ) + compiled_task.save(os.path.join(save_path, filename)) + return compiled_task + + +# def train(trainset, save_path, filename): +# from dspy.teleprompt import BootstrapFewShot +# import os + +# if not os.path.exists(save_path): +# os.makedirs(save_path) + +# # teleprompter = BootstrapFewShot(metric=validate_context_and_answer_and_hops) +# teleprompter = BootstrapFewShot(metric=validate_answer) +# compiled_baleen = teleprompter.compile( +# SimplifiedBaleen(), +# teacher=SimplifiedBaleen(passages_per_hop=2), +# trainset=trainset, +# ) +# turbo.inspect_history(n=3) +# compiled_baleen.save(os.path.join(save_path, filename)) +# return compiled_baleen + + +def validate(devset, compiled_baleen=None, uncompiled_baleen=None): + from dspy.evaluate.evaluate import Evaluate + + # Define metric to check if we retrieved the correct documents + # def gold_passages_retrieved(example, pred, trace=None): + # gold_titles = set(map(dspy.evaluate.normalize_text, example["gold_titles"])) + # found_titles = set( + # map(dspy.evaluate.normalize_text, [c.split(" | ")[0] for c in pred.context]) + # ) + # return gold_titles.issubset(found_titles) + + # Set up the `evaluate_on_hotpotqa` function. We'll use this many times below. 
+ evaluate_on_hotpotqa = Evaluate( + devset=devset, + num_threads=4, + display_progress=True, + display_table=5, + # metric=validate_answer, + ) + if uncompiled_baleen is not None: + uncompiled_baleen_answer_score = evaluate_on_hotpotqa( + uncompiled_baleen, metric=validate_answer, display_progress=True + ) + print( + f"## Answer Score for uncompiled Baleen: {uncompiled_baleen_answer_score}" + ) + + if compiled_baleen is None: + return + + compiled_baleen_answer_score = evaluate_on_hotpotqa( + compiled_baleen, metric=validate_answer, display_progress=True + ) + print(f"## Answer Score for compiled Baleen: {compiled_baleen_answer_score}") + return compiled_baleen_answer_score + + +def train_and_validate(): + import os + + save_path = "benchmarks/hotpot_qa/dspy_multi_hop_rag_cycle_0_shot" + if not os.path.exists(save_path): + os.makedirs(save_path) + + import time + import tqdm + + dspy_trainset, dspy_valset, dspy_testset = load_datasets() + + val_accs = [] + test_accs = [] + training_times = [] + max_val_score = 0 + max_test_score = 0 + + num_runs = 4 + + for i in tqdm.tqdm(range(num_runs)): + start = time.time() + output_file = f"compiled_count_{i}.json" + + compiled_count = train_MIPROv2( + dspy_trainset, dspy_valset, save_path, output_file + ) + val_acc = validate(dspy_valset, compiled_count) # 46 + test_acc = validate(dspy_testset, compiled_count) # 52 + + val_accs.append(val_acc) + test_accs.append(test_acc) + + training_times.append(time.time() - start) + + # compute the mean and standard deviation + import numpy as np + + val_accs = np.array(val_accs) + test_accs = np.array(test_accs) + training_times = np.array(training_times) + max_val_score = val_accs.max() + max_test_score = test_accs.max() + print("Validation accuracy:", val_accs.mean(), val_accs.std()) + print("Test accuracy:", test_accs.mean(), test_accs.std()) + + print("Training time:", training_times.mean()) + print("Max val score: ", max_val_score) + print("Max test score: ", max_test_score) + + +if __name__ == "__main__": + from adalflow.utils import setup_env + + setup_env() + # Ask any question you like to this simple RAG program. + my_question = "How many storeys are in the castle that David Gregory inherited?" + + task = DsPyMultiHopRAGCycle() + trainset, valset, testset = load_datasets() + + # pred = uncompiled_baleen(my_question) + + # # Print the contexts and the answer. + # print(f"Question: {my_question}") + # print(f"Predicted Answer: {pred.answer}") + # print(f"Retrieved Contexts (truncated): {[c[:200] + '...' for c in pred.context]}") + # turbo.inspect_history(n=3) + + # Load the datasets. 
+ trainset, valset, testset = load_datasets() + + validate(valset, None, task) + validate(testset, None, task) + + train_and_validate() + + # train the model + # compiled_baleen = train(trainset, dspy_save_path, "hotpotqa.json") + # validate(devset, compiled_baleen, uncompiled_baleen) + + # with demos (2, 2) + # Validation accuracy: 46.75 5.11737237261468 + # Test accuracy: 47.75 0.75 + # Training time: 2091.4183588027954 + # Max val score: 53.0 + # Max test score: 48.5 + + # zero shot + + # Validation accuracy: 41.25 4.815340071064556 + # Test accuracy: 37.875 3.6464880364537056 + # Training time: 1550.8598119020462 + # Max val score: 45.0 + # Max test score: 42.5 diff --git a/benchmarks/hotpot_qa/dspy_vanilla_rag.py b/benchmarks/hotpot_qa/dspy_vanilla_rag.py new file mode 100644 index 00000000..4ab6ccee --- /dev/null +++ b/benchmarks/hotpot_qa/dspy_vanilla_rag.py @@ -0,0 +1,252 @@ +""" +Dspy's CoT is handled by itself with ChainOfThought. That is why our prompt (signature) does not need to include "think step by step". +""" + +import dspy +import dspy.evaluate + +from dspy import Example + + +turbo = dspy.OpenAI(model="gpt-3.5-turbo-0125") +gpt_4 = dspy.OpenAI(model="gpt-4o") + +colbertv2_wiki17_abstracts = dspy.ColBERTv2( + url="http://20.102.90.50:2017/wiki17_abstracts" +) + +dspy.settings.configure(lm=turbo, rm=colbertv2_wiki17_abstracts) +from adalflow.eval.answer_match_acc import AnswerMatchAcc + + +def load_datasets(): + from benchmarks.hotpot_qa.config import load_datasets + + trainset, valset, testset = load_datasets() + # dspy requires us to package the dataset to Example objects and specify the inputs + + dspy_trainset, dspy_valset, dspy_testset = [], [], [] + for dataset in zip( + [trainset, valset, testset], [dspy_trainset, dspy_valset, dspy_testset] + ): + for item in dataset[0]: + example = Example(question=item.question, answer=item.answer) + example = example.with_inputs("question") + dataset[1].append(example) + + return dspy_trainset, dspy_valset, dspy_testset + + +class GenerateAnswer(dspy.Signature): + """Answer questions with short factoid answers.""" + + context = dspy.InputField(desc="may contain relevant facts") + question = dspy.InputField() + answer = dspy.OutputField(desc="answer to the question") + + +class DsPyVanillaRAG(dspy.Module): + def __init__(self, passages_per_hop=3): + super().__init__() + + self.retrieve = dspy.Retrieve(k=passages_per_hop) + self.generate_answer = dspy.ChainOfThought(GenerateAnswer) + + def forward(self, question): + + passages = self.retrieve(question).passages + + pred = self.generate_answer(context=passages, question=question) + return dspy.Prediction(context=passages, answer=pred.answer) + + +# pred: Prediction + + +def validate_answer(example, pred, trace=None): + evaluator = AnswerMatchAcc(type="exact_match") + print(f"pred: {pred.answer}, example: {example['answer']}") + return evaluator.compute_single_item(str(pred.answer), example["answer"]) + + +def train_MIPROv2(trainset, valset, save_path, filename): + + import os + from dspy.teleprompt import MIPROv2 + + if not os.path.exists(save_path): + os.makedirs(save_path) + + tp = MIPROv2( + metric=validate_answer, + prompt_model=gpt_4, + task_model=turbo, + num_candidates=12, + init_temperature=1.0, + log_dir=save_path, + ) + compiled_task = tp.compile( + DsPyVanillaRAG(), + trainset=trainset, + valset=valset, + max_bootstrapped_demos=2, # 2, + max_labeled_demos=2, # 2, + num_batches=12, # MINIBATCH_SIZE = 25, (eval on trainset) + seed=2025, + requires_permission_to_run=False, + 
) + compiled_task.save(os.path.join(save_path, filename)) + return compiled_task + + +# def train(trainset, save_path, filename): +# from dspy.teleprompt import BootstrapFewShot +# import os + +# if not os.path.exists(save_path): +# os.makedirs(save_path) + +# # teleprompter = BootstrapFewShot(metric=validate_context_and_answer_and_hops) +# teleprompter = BootstrapFewShot(metric=validate_answer) +# compiled_baleen = teleprompter.compile( +# SimplifiedBaleen(), +# teacher=SimplifiedBaleen(passages_per_hop=2), +# trainset=trainset, +# ) +# turbo.inspect_history(n=3) +# compiled_baleen.save(os.path.join(save_path, filename)) +# return compiled_baleen + + +def validate(devset, compiled_baleen=None, uncompiled_baleen=None): + from dspy.evaluate.evaluate import Evaluate + + # Define metric to check if we retrieved the correct documents + # def gold_passages_retrieved(example, pred, trace=None): + # gold_titles = set(map(dspy.evaluate.normalize_text, example["gold_titles"])) + # found_titles = set( + # map(dspy.evaluate.normalize_text, [c.split(" | ")[0] for c in pred.context]) + # ) + # return gold_titles.issubset(found_titles) + + # Set up the `evaluate_on_hotpotqa` function. We'll use this many times below. + evaluate_on_hotpotqa = Evaluate( + devset=devset, + num_threads=4, + display_progress=True, + display_table=5, + # metric=validate_answer, + ) + if uncompiled_baleen is not None: + uncompiled_baleen_answer_score = evaluate_on_hotpotqa( + uncompiled_baleen, metric=validate_answer, display_progress=True + ) + print( + f"## Answer Score for uncompiled Baleen: {uncompiled_baleen_answer_score}" + ) + + if compiled_baleen is None: + return + + compiled_baleen_answer_score = evaluate_on_hotpotqa( + compiled_baleen, metric=validate_answer, display_progress=True + ) + print(f"## Answer Score for compiled Baleen: {compiled_baleen_answer_score}") + return compiled_baleen_answer_score + + +def train_and_validate(): + import os + + save_path = "benchmarks/hotpot_qa/dspy_vanilla_rag_4_shot" + if not os.path.exists(save_path): + os.makedirs(save_path) + + import time + import tqdm + + dspy_trainset, dspy_valset, dspy_testset = load_datasets() + + val_accs = [] + test_accs = [] + training_times = [] + max_val_score = 0 + max_test_score = 0 + + num_runs = 4 + + for i in tqdm.tqdm(range(num_runs)): + start = time.time() + output_file = f"compiled_count_{i}.json" + + compiled_count = train_MIPROv2( + dspy_trainset, dspy_valset, save_path, output_file + ) + val_acc = validate(dspy_valset, compiled_count) # 46 + test_acc = validate(dspy_testset, compiled_count) # 52 + + val_accs.append(val_acc) + test_accs.append(test_acc) + + training_times.append(time.time() - start) + + # compute the mean and standard deviation + import numpy as np + + val_accs = np.array(val_accs) + test_accs = np.array(test_accs) + training_times = np.array(training_times) + max_val_score = val_accs.max() + max_test_score = test_accs.max() + print("Validation accuracy:", val_accs.mean(), val_accs.std()) + print("Test accuracy:", test_accs.mean(), test_accs.std()) + + print("Training time:", training_times.mean()) + print("Max val score: ", max_val_score) + print("Max test score: ", max_test_score) + + +if __name__ == "__main__": + from adalflow.utils import setup_env + + setup_env() + # Ask any question you like to this simple RAG program. + my_question = "How many storeys are in the castle that David Gregory inherited?" 
+ + task = DsPyVanillaRAG() + trainset, valset, testset = load_datasets() + + # pred = uncompiled_baleen(my_question) + + # # Print the contexts and the answer. + # print(f"Question: {my_question}") + # print(f"Predicted Answer: {pred.answer}") + # print(f"Retrieved Contexts (truncated): {[c[:200] + '...' for c in pred.context]}") + # turbo.inspect_history(n=3) + + # Load the datasets. + trainset, valset, testset = load_datasets() + + validate(valset, None, task) # 25% + validate(testset, None, task) # 29.5% + + train_and_validate() + + # train the model + # compiled_baleen = train(trainset, dspy_save_path, "hotpotqa.json") + # validate(devset, compiled_baleen, uncompiled_baleen) + + # 0 shot + + # Validation accuracy: 26.25 2.8613807855648994 + # Test accuracy: 29.75 2.1937410968480306 + # Training time: 720.5469482541084 + # Max val score: 30.0 + # Max test score: 31.5 + + # 4 shot + # Validation accuracy: 40.5 2.29128784747792 + # Test accuracy: 42.375 3.7811208655635435 + # Training time: 706.7893784046173 + # Max val score: 43.0 + # Max test score: 47.5
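
For reference, a minimal usage sketch of the evaluation flow this diff settles on: validate(devset, compiled_baleen=None, uncompiled_baleen=None) now takes optional programs, scores the uncompiled (zero-shot) program only when one is passed, and returns the compiled program's score so it can be aggregated across MIPROv2 runs as in train_and_validate(). The import path and running from the repository root are assumptions; the names and call signatures mirror the __main__ and train_and_validate blocks above.

    # Sketch only; assumes the repository root is on PYTHONPATH so the benchmarks package is importable.
    from adalflow.utils import setup_env
    from benchmarks.hotpot_qa.dspy_vanilla_rag import (
        DsPyVanillaRAG,
        load_datasets,
        train_MIPROv2,
        validate,
    )

    setup_env()  # load model credentials, as the scripts above do
    trainset, valset, testset = load_datasets()

    # Zero-shot baseline: only the uncompiled program is evaluated.
    validate(valset, uncompiled_baleen=DsPyVanillaRAG())

    # One MIPROv2 run: the compiled program's score is returned so it can be aggregated.
    save_path = "benchmarks/hotpot_qa/dspy_vanilla_rag_4_shot"  # same path used above
    compiled = train_MIPROv2(trainset, valset, save_path, "compiled_count_0.json")
    val_acc = validate(valset, compiled)
    test_acc = validate(testset, compiled)
    print("val:", val_acc, "test:", test_acc)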