Skip to content

Commit 12a6c18

Browse files
committed
feat: refactor base_graph
1 parent 3b2cadc commit 12a6c18

File tree

1 file changed

+149
-105
lines changed

1 file changed

+149
-105
lines changed

scrapegraphai/graphs/base_graph.py

Lines changed: 149 additions & 105 deletions
Original file line numberDiff line numberDiff line change
@@ -98,21 +98,116 @@ def _set_conditional_node_edges(self):
9898
except:
9999
node.false_node_name = None
100100

101-
def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:
102-
"""
103-
Executes the graph by traversing nodes starting from the
104-
entry point using the standard method.
101+
def _get_node_by_name(self, node_name: str):
102+
"""Returns a node instance by its name."""
103+
return next(node for node in self.nodes if node.node_name == node_name)
105104

106-
Args:
107-
initial_state (dict): The initial state to pass to the entry point node.
105+
def _update_source_info(self, current_node, state):
106+
"""Updates source type and source information from FetchNode."""
107+
source_type = None
108+
source = []
109+
prompt = None
110+
111+
if current_node.__class__.__name__ == "FetchNode":
112+
source_type = list(state.keys())[1]
113+
if state.get("user_prompt", None):
114+
prompt = state["user_prompt"] if isinstance(state["user_prompt"], str) else None
115+
116+
if source_type == "local_dir":
117+
source_type = "html_dir"
118+
elif source_type == "url":
119+
if isinstance(state[source_type], list):
120+
source.extend(url for url in state[source_type] if isinstance(url, str))
121+
elif isinstance(state[source_type], str):
122+
source.append(state[source_type])
123+
124+
return source_type, source, prompt
125+
126+
def _get_model_info(self, current_node):
127+
"""Extracts LLM and embedder model information from the node."""
128+
llm_model = None
129+
llm_model_name = None
130+
embedder_model = None
108131

109-
Returns:
110-
Tuple[dict, list]: A tuple containing the final state and a list of execution info.
132+
if hasattr(current_node, "llm_model"):
133+
llm_model = current_node.llm_model
134+
if hasattr(llm_model, "model_name"):
135+
llm_model_name = llm_model.model_name
136+
elif hasattr(llm_model, "model"):
137+
llm_model_name = llm_model.model
138+
elif hasattr(llm_model, "model_id"):
139+
llm_model_name = llm_model.model_id
140+
141+
if hasattr(current_node, "embedder_model"):
142+
embedder_model = current_node.embedder_model
143+
if hasattr(embedder_model, "model_name"):
144+
embedder_model = embedder_model.model_name
145+
elif hasattr(embedder_model, "model"):
146+
embedder_model = embedder_model.model
147+
148+
return llm_model, llm_model_name, embedder_model
149+
150+
def _get_schema(self, current_node):
151+
"""Extracts schema information from the node configuration."""
152+
if not hasattr(current_node, "node_config"):
153+
return None
154+
155+
if not isinstance(current_node.node_config, dict):
156+
return None
157+
158+
schema_config = current_node.node_config.get("schema")
159+
if not schema_config or isinstance(schema_config, dict):
160+
return None
161+
162+
try:
163+
return schema_config.schema()
164+
except Exception:
165+
return None
166+
167+
def _execute_node(self, current_node, state, llm_model, llm_model_name):
168+
"""Executes a single node and returns execution information."""
169+
curr_time = time.time()
170+
171+
with self.callback_manager.exclusive_get_callback(llm_model, llm_model_name) as cb:
172+
result = current_node.execute(state)
173+
node_exec_time = time.time() - curr_time
174+
175+
cb_data = None
176+
if cb is not None:
177+
cb_data = {
178+
"node_name": current_node.node_name,
179+
"total_tokens": cb.total_tokens,
180+
"prompt_tokens": cb.prompt_tokens,
181+
"completion_tokens": cb.completion_tokens,
182+
"successful_requests": cb.successful_requests,
183+
"total_cost_USD": cb.total_cost,
184+
"exec_time": node_exec_time,
185+
}
186+
187+
return result, node_exec_time, cb_data
188+
189+
def _get_next_node(self, current_node, result):
190+
"""Determines the next node to execute based on current node type and result."""
191+
if current_node.node_type == "conditional_node":
192+
node_names = {node.node_name for node in self.nodes}
193+
if result in node_names:
194+
return result
195+
elif result is None:
196+
return None
197+
raise ValueError(
198+
f"Conditional Node returned a node name '{result}' that does not exist in the graph"
199+
)
200+
201+
return self.edges.get(current_node.node_name)
202+
203+
def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:
204+
"""
205+
Executes the graph by traversing nodes starting from the entry point using the standard method.
111206
"""
112207
current_node_name = self.entry_point
113208
state = initial_state
114-
115-
# variables for tracking execution info
209+
210+
# Tracking variables
116211
total_exec_time = 0.0
117212
exec_info = []
118213
cb_total = {
@@ -134,104 +229,51 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:
134229
schema = None
135230

136231
while current_node_name:
137-
curr_time = time.time()
138-
current_node = next(node for node in self.nodes if node.node_name == current_node_name)
139-
140-
if current_node.__class__.__name__ == "FetchNode":
141-
source_type = list(state.keys())[1]
142-
if state.get("user_prompt", None):
143-
prompt = state["user_prompt"] if isinstance(state["user_prompt"], str) else None
144-
145-
if source_type == "local_dir":
146-
source_type = "html_dir"
147-
elif source_type == "url":
148-
if isinstance(state[source_type], list):
149-
for url in state[source_type]:
150-
if isinstance(url, str):
151-
source.append(url)
152-
elif isinstance(state[source_type], str):
153-
source.append(state[source_type])
154-
155-
if hasattr(current_node, "llm_model") and llm_model is None:
156-
llm_model = current_node.llm_model
157-
if hasattr(llm_model, "model_name"):
158-
llm_model_name = llm_model.model_name
159-
elif hasattr(llm_model, "model"):
160-
llm_model_name = llm_model.model
161-
elif hasattr(llm_model, "model_id"):
162-
llm_model_name = llm_model.model_id
163-
164-
if hasattr(current_node, "embedder_model") and embedder_model is None:
165-
embedder_model = current_node.embedder_model
166-
if hasattr(embedder_model, "model_name"):
167-
embedder_model = embedder_model.model_name
168-
elif hasattr(embedder_model, "model"):
169-
embedder_model = embedder_model.model
170-
171-
if hasattr(current_node, "node_config"):
172-
if isinstance(current_node.node_config,dict):
173-
if current_node.node_config.get("schema", None) and schema is None:
174-
if not isinstance(current_node.node_config["schema"], dict):
175-
try:
176-
schema = current_node.node_config["schema"].schema()
177-
except Exception as e:
178-
schema = None
179-
180-
with self.callback_manager.exclusive_get_callback(llm_model, llm_model_name) as cb:
181-
try:
182-
result = current_node.execute(state)
183-
except Exception as e:
184-
error_node = current_node.node_name
185-
graph_execution_time = time.time() - start_time
186-
log_graph_execution(
187-
graph_name=self.graph_name,
188-
source=source,
189-
prompt=prompt,
190-
schema=schema,
191-
llm_model=llm_model_name,
192-
embedder_model=embedder_model,
193-
source_type=source_type,
194-
execution_time=graph_execution_time,
195-
error_node=error_node,
196-
exception=str(e)
197-
)
198-
raise e
199-
node_exec_time = time.time() - curr_time
232+
current_node = self._get_node_by_name(current_node_name)
233+
234+
# Update source information if needed
235+
if source_type is None:
236+
source_type, source, prompt = self._update_source_info(current_node, state)
237+
238+
# Get model information if needed
239+
if llm_model is None:
240+
llm_model, llm_model_name, embedder_model = self._get_model_info(current_node)
241+
242+
# Get schema if needed
243+
if schema is None:
244+
schema = self._get_schema(current_node)
245+
246+
try:
247+
result, node_exec_time, cb_data = self._execute_node(
248+
current_node, state, llm_model, llm_model_name
249+
)
200250
total_exec_time += node_exec_time
201251

202-
if cb is not None:
203-
cb_data = {
204-
"node_name": current_node.node_name,
205-
"total_tokens": cb.total_tokens,
206-
"prompt_tokens": cb.prompt_tokens,
207-
"completion_tokens": cb.completion_tokens,
208-
"successful_requests": cb.successful_requests,
209-
"total_cost_USD": cb.total_cost,
210-
"exec_time": node_exec_time,
211-
}
212-
252+
if cb_data:
213253
exec_info.append(cb_data)
214-
215-
cb_total["total_tokens"] += cb_data["total_tokens"]
216-
cb_total["prompt_tokens"] += cb_data["prompt_tokens"]
217-
cb_total["completion_tokens"] += cb_data["completion_tokens"]
218-
cb_total["successful_requests"] += cb_data["successful_requests"]
219-
cb_total["total_cost_USD"] += cb_data["total_cost_USD"]
220-
221-
if current_node.node_type == "conditional_node":
222-
node_names = {node.node_name for node in self.nodes}
223-
if result in node_names:
224-
current_node_name = result
225-
elif result is None:
226-
current_node_name = None
227-
else:
228-
raise ValueError(f"Conditional Node returned a node name '{result}' that does not exist in the graph")
229-
230-
elif current_node_name in self.edges:
231-
current_node_name = self.edges[current_node_name]
232-
else:
233-
current_node_name = None
234-
254+
for key in cb_total:
255+
cb_total[key] += cb_data[key]
256+
257+
current_node_name = self._get_next_node(current_node, result)
258+
259+
except Exception as e:
260+
error_node = current_node.node_name
261+
graph_execution_time = time.time() - start_time
262+
log_graph_execution(
263+
graph_name=self.graph_name,
264+
source=source,
265+
prompt=prompt,
266+
schema=schema,
267+
llm_model=llm_model_name,
268+
embedder_model=embedder_model,
269+
source_type=source_type,
270+
execution_time=graph_execution_time,
271+
error_node=error_node,
272+
exception=str(e)
273+
)
274+
raise e
275+
276+
# Add total results to execution info
235277
exec_info.append({
236278
"node_name": "TOTAL RESULT",
237279
"total_tokens": cb_total["total_tokens"],
@@ -242,6 +284,7 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:
242284
"exec_time": total_exec_time,
243285
})
244286

287+
# Log final execution results
245288
graph_execution_time = time.time() - start_time
246289
response = state.get("answer", None) if source_type == "url" else None
247290
content = state.get("parsed_doc", None) if response is not None else None
@@ -300,3 +343,4 @@ def append_node(self, node):
300343
self.raw_edges.append((last_node, node))
301344
self.nodes.append(node)
302345
self.edges = self._create_edges({e for e in self.raw_edges})
346+

0 commit comments

Comments
 (0)