diff --git a/adalflow/adalflow/components/agent/react.py b/adalflow/adalflow/components/agent/react.py
index f6336334..abe0d78d 100644
--- a/adalflow/adalflow/components/agent/react.py
+++ b/adalflow/adalflow/components/agent/react.py
@@ -22,6 +22,7 @@
     FunctionOutput,
     FunctionExpression,
 )
+from adalflow.optim.grad_component import fun_to_grad_component
 from adalflow.core.model_client import ModelClient
 from adalflow.utils.logger import printc
 
@@ -159,7 +160,6 @@ def call(
         step_output.function = func
 
         step_output.observation = result.output
-
         return step_output
 
 
@@ -302,13 +302,7 @@ def _init_tools(
         model_client: ModelClient,
         model_kwargs: Dict,
     ):
-        r"""Initialize the tools. Using copy or not can impact the status of tools depending on the tools' status."""
-        # try:
-        #     tools = [deepcopy(tool) for tool in tools]
-        # except Exception:
-        #     from copy import copy
-
-        #     tools = [copy(tool) for tool in tools]
+        r"""Initialize the tools. Tools are used by reference; with a copy or deepcopy we cannot set the training/eval mode for each tool."""
 
         tools = tools
         _additional_llm_tool = (
@@ -344,32 +338,30 @@ def finish(answer: str, **kwargs) -> str:
 
     def _execute_action(
         self,
-        action_step: StepOutput,
+        step_output: StepOutput,
         response: Union[Parameter, GeneratorOutput],
         id: Optional[str] = None,
     ) -> Optional[StepOutput]:
-        """Parse the action string to a function call and execute it. Update the action_step with the result."""
-        # extract the action from the response
+        """Parse the action string to a function call and execute it. Update the step_output with the result."""
 
-        if isinstance(response, Parameter):
+        def handle_error(response: Parameter, e: str):
 
-            def handle_error(response: Parameter, e: str):
-                from adalflow.optim.grad_component import fun_to_grad_component
+            @fun_to_grad_component
+            def set_step_output_with_error(
+                step_output: StepOutput, error: str, response: Any
+            ):
+                """Store the error text and the response data as the step's observation and return the step."""
+                step_output.observation = f"error: {error} at {response.data}"
+                return step_output
 
-                print(f"action_step: {action_step}")
+            response.add_successor_map_fn(
+                successor=set_step_output_with_error, map_fn=lambda x: x.data
+            )
+            return set_step_output_with_error.forward(step_output, e, response)
 
-                @fun_to_grad_component
-                def set_step_output_with_error(
-                    step_output: StepOutput, error: str, response: Any
-                ):
-                    """Set the step_output with error."""
-                    step_output.observation = f"erro: {error} at {response.data}"
-                    return step_output
+        step = step_output.step
 
-                response.add_successor_map_fn(
-                    successor=set_step_output_with_error, map_fn=lambda x: x.data
-                )
-                return set_step_output_with_error.forward(action_step, e, response)
+        if isinstance(response, Parameter):
 
             try:
                 function_output_to_step_output = FunctionOutputToStepOutput()
@@ -379,15 +371,13 @@ def set_step_output_with_error(
                     expr_or_fun=response, step="parse", map_fn=lambda x: x.data.data
                 )
                 # add action to the step_output
-                action_step.action = response.data.data
+                step_output.action = response.data.data
                 # parse failed
                 if not isinstance(func, Parameter):
                     raise ValueError(
                         f"Expected Parameter, but got {type(func)}: {func}"
                     )
                 if isinstance(func, str):
-                    # create dummy step output
-                    from adalflow.optim.grad_component import fun_to_grad_component
 
                     @fun_to_grad_component
                     def set_step_output_with_error(
@@ -401,10 +391,10 @@ def set_step_output_with_error(
                         successor=set_step_output_with_error,
                         map_fn=lambda x: x.data.data,
                     )
-                    action_step = set_step_output_with_error.forward(
-                        action_step, response, error=func
+                    step_output = set_step_output_with_error.forward(
+                        step_output, response, error=func
                     )
-                    return action_step
+                    return step_output
 
             except Exception as e:
                 e = f"{e} at parsing error at functionexpression: {response.data}"
@@ -426,7 +416,6 @@ def set_step_output_with_error(
 
                 if isinstance(result, str):
                     # create dummy step output
-                    from adalflow.optim.grad_component import fun_to_grad_component
 
                     @fun_to_grad_component
                     def set_step_output_with_error(step_output: StepOutput, data: str):
@@ -439,11 +428,11 @@ def set_step_output_with_error(step_output: StepOutput, data: str):
                         successor=set_step_output_with_error,
                         map_fn=lambda x: x.data.data,
                     )
-                    action_step = set_step_output_with_error.forward(
-                        action_step, response
+                    step_output = set_step_output_with_error.forward(
+                        step_output, response
                     )
 
-                    return action_step
+                    return step_output
 
             except Exception as e:
                 e = f"{e} Error executing function: {func}"
@@ -461,14 +450,14 @@ def set_step_output_with_error(step_output: StepOutput, data: str):
                 func.add_successor_map_fn(
                     successor=function_output_to_step_output, map_fn=lambda x: x.data
                 )
-                action_step = function_output_to_step_output.forward(
+                step_output = function_output_to_step_output.forward(
                     action_str=response,
-                    step=action_step.step,
+                    step=step,
                     result=result,
                     func=func,
                 )
 
-                return action_step
+                return step_output
             except Exception as e:
                 e = f"{e} Error converting function output to step output: {result.data}"
 
@@ -478,8 +467,8 @@ def set_step_output_with_error(step_output: StepOutput, data: str):
 
             return self._execute_action_eval_mode(
                 x=response,
-                step_output=action_step,
-                step=action_step.step,
+                step_output=step_output,
+                step=step,
                 id=id,
             )
 
@@ -548,14 +537,14 @@ def _run_one_step(
         step_history_value = (
             step_history.data if isinstance(step_history, Parameter) else step_history
         )
-        for step in step_history_value:
-            if not step:
+        for data in step_history_value:
+            if not data:
                 raise ValueError(
-                    f"Expected StepOutput, but got {type(step)}, all steps: {step_history_value}"
+                    f"Expected StepOutput, but got {type(data)}, all steps: {step_history_value}"
                 )
-            if not isinstance(step, StepOutput):
+            if not isinstance(data, StepOutput):
                 raise ValueError(
-                    f"Expected StepOutput, but got {type(step)}, all steps: {step_history_value}"
+                    f"Expected StepOutput, but got {type(data)}, all steps: {step_history_value}"
                 )
 
         log.debug(
@@ -566,11 +555,6 @@ def _run_one_step(
             response: Union[GeneratorOutput, Parameter] = self.planner(
                 prompt_kwargs=prompt_kwargs, model_kwargs=model_kwargs, id=id
             )
-        # except Exception as e:
-        #     error_msg = f"Error happened in planner response: {e}. Training mode: {self.planner.training}"
-        #     raise ValueError(
-        #         error_msg
-        #     )  # raise the error for debugging as this should not happen in normal cases.
 
         except Exception as e:
             error_msg = f"Error happened in planner response at step {step}: {e}.\n"
@@ -580,7 +564,6 @@ def _run_one_step(
             error_msg += f"Traceback:\n{traceback.format_exc()}"
             raise RuntimeError(error_msg)
 
-        # create a new step output
         step_output: StepOutput = StepOutput(step=step)
 
         try:
@@ -591,9 +574,8 @@ def _run_one_step(
                     raise ValueError(
                         f"Expected GeneratorOutput, but got {type(response.data)}, value: {response.data}"
                     )
-
+                # Detect planner parsing errors to FunctionExpression so that the prompt can be trained to self-correct
                 if not isinstance(response.data.data, FunctionExpression):
-                    from adalflow.optim.grad_component import fun_to_grad_component
 
                     @fun_to_grad_component
                     def set_step_output_with_error(
@@ -616,13 +598,14 @@ def set_step_output_with_error(
                     step_output: Parameter = self._execute_action(
                         step_output, response, id
                     )
-
-                # printc(f"step_output: {step_output}", color="red")
+                if self.debug:
+                    printc(f"step_output: {step_output.data}", color="red")
                 if not isinstance(step_output, Parameter):
                     raise ValueError(
                         f"Ensure step_output to be Parameter at training mode. Got {type(step_output)}.\n\
                             Please check the observation for error details: {step_output}"
                     )
+                # combine the current step_output with the step_history
                 step_output.add_successor_map_fn(
                     successor=self.append_step_history, map_fn=lambda x: x.data
                 )
@@ -646,7 +629,7 @@ def set_step_output_with_error(
             else:
 
                 step_output: StepOutput = self._execute_action(
-                    action_step=step_output, response=response, id=id
+                    step_output=step_output, response=response, id=id
                 )
                 if not step_output:
                     raise RuntimeError(
@@ -740,7 +723,6 @@ def bicall(
         step_history = None
 
         if self.training:
-
             step_history = Parameter(
                 data=[],
                 param_type=ParameterType.INPUT,
@@ -826,7 +808,7 @@ def forward(
     # print(OutputParameter.__mro__)
 
     app = App()
-    app.train()
+    app.eval()
     output = app("I want to multiply 3 and 4.", id="123")
     print(output)
-    output.draw_graph()
+    # output.draw_graph()
diff --git a/adalflow/adalflow/core/base_data_class.py b/adalflow/adalflow/core/base_data_class.py
index 1a379724..543a1090 100644
--- a/adalflow/adalflow/core/base_data_class.py
+++ b/adalflow/adalflow/core/base_data_class.py
@@ -292,7 +292,9 @@ class TrecDataList(DataClass):
             # {'data': [{'question': 'What is the capital of France?'}]}
         """
         if not is_dataclass(self):
-            raise ValueError("to_dict() called on a class type, not an instance.")
+            raise ValueError(
+                f"to_dict() is not called on a dataclass instance: {self.__class__}. You might forget to use @dataclass decorator."
+            )
         # convert all fields to its data if its parameter
         fields = self.__dataclass_fields__
         from adalflow.optim.parameter import Parameter
diff --git a/adalflow/adalflow/core/func_tool.py b/adalflow/adalflow/core/func_tool.py
index 7c5e0f1d..6524ad7d 100644
--- a/adalflow/adalflow/core/func_tool.py
+++ b/adalflow/adalflow/core/func_tool.py
@@ -253,7 +253,6 @@ def sync_function_1():
                 raise ValueError(
                     f"FunctionTool {self.definition.func_name} is in eval mode, but the output is Parameter"
                 )
-            print("output is Parameter")
             output.data = FunctionOutput(
                 name=self.definition.func_name,
                 # raw_input={"args": args, "kwargs": kwargs},
diff --git a/adalflow/adalflow/core/tool_manager.py b/adalflow/adalflow/core/tool_manager.py
index 4ac17607..340fbfbb 100644
--- a/adalflow/adalflow/core/tool_manager.py
+++ b/adalflow/adalflow/core/tool_manager.py
@@ -77,12 +77,12 @@ def bicall(
         context: Dict[str, object] = {},
     ):
         if isinstance(func, Parameter):
-            printc(f"context: {context}", color="yellow")
+            # printc(f"context: {context}", color="yellow")
             func_data: Function = func.map_to_successor(self)
             if not isinstance(func_data, Function):
                 raise ValueError(f"Error parsing function expression: {func}")
             tool: FunctionTool = context[func_data.name]
-            print(f"tool training: {tool.training}")
+            # print(f"tool training: {tool.training}")
             output = tool.forward(*func_data.args, **func_data.kwargs)
 
             from adalflow.optim.grad_component import fun_to_grad_component
@@ -119,16 +119,16 @@ def call(self, expr: FunctionExpression, context: Dict[str, object]) -> Function
 
         expr_str = expr.action
         func_name, args, kwargs = parse_function_call_expr(expr_str, context)
-        printc(
-            f"func_name: {func_name}, args: {args}, kwargs: {kwargs}", color="yellow"
-        )
+        # printc(
+        #     f"func_name: {func_name}, args: {args}, kwargs: {kwargs}", color="yellow"
+        # )
         output = Function(
             name=func_name,
             args=args,
             kwargs=kwargs,
             thought=expr.thought,
         )
-        printc(f"output: {output}", color="yellow")
+        # printc(f"output: {output}", color="yellow")
         return output
 
 
@@ -231,9 +231,9 @@ def parse_func_expr(
 
                 func = FunctionExperssionToFunction()
                 expr.add_successor_map_fn(func, map_fn=map_fn)
-                print("FunctionExperssionToFunction")
+                # print("FunctionExperssionToFunction")
                 output = func.forward(expr, context=self.context)
-                print(f"output data: {output.data}")
+                # print(f"output data: {output.data}")
                 return output
 
             except Exception as e:
@@ -301,7 +301,6 @@ def forward(
         "Run a forward pass on the tool manager such as parsing function expression or executing function."
         if isinstance(expr_or_fun, Parameter):
             expr_or_fun_data = map_fn(expr_or_fun)
-            print(f"expr_or_fun_data: {expr_or_fun_data}")
             if step == "execute":
                 if isinstance(expr_or_fun_data, Function):
                     return self.execute_func(expr_or_fun, map_fn=map_fn)
@@ -311,9 +310,7 @@ def forward(
                     )
             else:
                 if isinstance(expr_or_fun_data, FunctionExpression):
-                    print(f"start parsing: {expr_or_fun_data}")
                     output = self.parse_func_expr(expr_or_fun, map_fn=map_fn)
-                    print(f"output 3: {output.data}")
                     return output
                 else:
                     raise NotImplementedError(
diff --git a/adalflow/adalflow/optim/grad_component.py b/adalflow/adalflow/optim/grad_component.py
index d306a508..c30f8b63 100644
--- a/adalflow/adalflow/optim/grad_component.py
+++ b/adalflow/adalflow/optim/grad_component.py
@@ -163,7 +163,6 @@ def backward(self, *, response: "Parameter", id: str = None, **kwargs):
 
         Subclass should implement this method if you need additional backward logic.
         """
-        from adalflow.optim.parameter import GradientContext
 
         log.info(f"GradComponent backward: {response.name}")
         children_params = response.predecessors
@@ -197,13 +196,14 @@ def backward(self, *, response: "Parameter", id: str = None, **kwargs):
                 grad.is_default_copy = (
                     True  # response and pred will keep the original gradient
                 )
-                grad.add_context(
-                    GradientContext(
-                        variable_desc=pred.role_desc,
-                        response_desc=response.name,
-                        input_output=f"""{response.component_trace.to_context_str()}""",
-                    )
-                )
+                # NOTE: testing whether to keep the initial gradient context, so the add_context call below is commented out
+                # grad.add_context(
+                #     GradientContext(
+                #         variable_desc=pred.role_desc,
+                #         response_desc=response.name,
+                #         input_output=f"""{response.component_trace.to_context_str()}""",
+                #     )
+                # )
 
                 pred.add_gradient(grad)
 
diff --git a/adalflow/adalflow/optim/parameter.py b/adalflow/adalflow/optim/parameter.py
index 6984128e..b411f980 100644
--- a/adalflow/adalflow/optim/parameter.py
+++ b/adalflow/adalflow/optim/parameter.py
@@ -240,7 +240,7 @@ def __init__(
         self.data_type = type(data)
 
         self.set_eval_fn_input(eval_input=data)
-        self.gradients: List[Gradient] = []  # <FEEDBACK>gradient.data</FEEDBACK>
+        self.gradients: Set[Gradient] = set()
 
         self.grad_fn = None
 
@@ -301,12 +301,12 @@ def add_gradient(self, gradient: "Gradient"):
         start_order = len(self.gradients)
         gradient.order = start_order
 
-        self.gradients.append(gradient)
+        self.gradients.add(gradient)
         # sort the gradients by the data_id, response_component_id, and score
         self.sort_gradients()
 
     def reset_gradients(self):
-        self.gradients = []
+        self.gradients = set()
 
     def get_gradients_names(self) -> str:
         names = [g.name for g in self.gradients]
@@ -534,6 +534,8 @@ def sort_gradients(self):
                 x.score,
             ),
         )
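+        # make it a set again. NOTE(review): sets are unordered, so the order from the sort above is presumably lost here — confirm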
+        self.gradients = set(self.gradients)
 
     ############################################################################################################
     # Setters and getters
@@ -1738,24 +1740,44 @@ def update_from_to(self, from_response: "Parameter", to_pred: "Parameter"):
     def add_prompt(self, prompt: str):
         self.prompt = prompt
 
+    def __hash__(self):
+        # Hash the same four fields that __eq__ compares; NOTE(review): assumes they stay fixed once the gradient is in a set — confirm
+        return hash((self.id, self.data_id, self.from_response_id, self.to_pred_id))
+
+    def __eq__(self, other):
+        """Return True if `other` is a Gradient with the same id, data_id, from_response_id, and to_pred_id."""
+        if not isinstance(other, Gradient):
+            return NotImplemented  # let Python try the reflected comparison instead of forcing False
+        return (
+            self.id == other.id
+            and self.data_id == other.data_id
+            and self.from_response_id == other.from_response_id
+            and self.to_pred_id == other.to_pred_id
+        )
 
-# Move the gradients representation to this class.
-@dataclass
-class Gradients(DataClass):
-    gradients: List[Gradient] = field(
-        default_factory=list, metadata={"desc": "The list of gradients"}
-    )
-
-    def __init__(self, gradients: List[Gradient]):
-        self.gradients = gradients
 
-    def to_dict(self):
-        return {"gradients": [g.to_dict() for g in self.gradients]}
+if __name__ == "__main__":
 
-    @classmethod
-    def from_dict(cls, data: dict):
-        gradients = [Gradient.from_dict(g) for g in data["gradients"]]
-        return cls(gradients)
+    # manual check of Gradient equality, hashing, and to_dict
+    from_response = OutputParameter(
+        name="p1",
+        role_desc="role1",
+        data=1,
+    )
+    from_response.component_trace = ComponentTrace(id="1")
+    g1 = Gradient(
+        from_response=from_response,
+        to_pred=Parameter(name="p2", role_desc="role2", data=2),
+        data_id="1",
+    )
+    g2 = Gradient(
+        from_response=from_response,
+        to_pred=Parameter(name="p2", role_desc="role2", data=2),
+        data_id="1",
+    )
+    print(g1 == g2)
+    print(g1.__hash__())
+    print(g2.__hash__())
+    print(isinstance(g1, Gradient))  # Should print True
 
-    def __repr__(self):
-        return f"Gradients(gradients={self.gradients})"
+    print(g1.to_dict())
diff --git a/benchmarks/hotpot_qa/adal_exp/build_multi_hop_rag.py b/benchmarks/hotpot_qa/adal_exp/build_multi_hop_rag.py
index 8e361f92..0c70f61b 100644
--- a/benchmarks/hotpot_qa/adal_exp/build_multi_hop_rag.py
+++ b/benchmarks/hotpot_qa/adal_exp/build_multi_hop_rag.py
@@ -612,7 +612,6 @@ def dspy_retriever_as_tool(
             r"""Retrieves the top k passages from using input as the query and save the documents in context_variables(Dict)'s context.
             Ensure you get all the context to answer the original question.
             """
-            print(f"training: {self.dspy_retriever.training}")
             output = self.dspy_retriever(input=input, id=id)
             parsed_output = output
             if isinstance(output, adal.Parameter):
diff --git a/benchmarks/hotpot_qa/adal_exp/train_agent_rag.py b/benchmarks/hotpot_qa/adal_exp/train_agent_rag.py
index 93bd1fc5..e350c815 100644
--- a/benchmarks/hotpot_qa/adal_exp/train_agent_rag.py
+++ b/benchmarks/hotpot_qa/adal_exp/train_agent_rag.py
@@ -161,7 +161,7 @@ def train(
     # )
 
     train(
-        debug=True,
+        debug=False,
         max_steps=12,
         # resume_from_ckpt="/Users/liyin/.adalflow/ckpt/AgenticRAGAdal/constrained_max_steps_4_dca7e_run_1.json",
     )
@@ -172,4 +172,5 @@ def train(
     # 0.72->0.74, 4 steps, 366s, /Users/liyin/.adalflow/ckpt/AgenticRAGAdal/constrained_max_steps_4_dca7e_run_1.json [Already faster, still lots to optimize]
 
     # 1246s, 12 steps, 0.8 val, /Users/liyin/.adalflow/ckpt/AgenticRAGAdal/constrained_max_steps_12_defe7_run_1.json
-    # v2149s, both gradients, 0.68 -> 0.78 /Users/liyin/.adalflow/ckpt/AgenticRAGAdal/constrained_max_steps_12_8a24a_run_1.json
+    # 2149s, both gradients, 0.68 -> 0.78 /Users/liyin/.adalflow/ckpt/AgenticRAGAdal/constrained_max_steps_12_8a24a_run_1.json
+    # /Users/liyin/.adalflow/ckpt/AgenticRAGAdal/constrained_max_steps_12_cdcb5_run_1.json 1728 s, 0.8