
Commit c57d387

[patch] Persistent process (#476)
* Catch already-running children in Composite

* Add temporary results infrastructure

* Allow run results to be (de)serialized at run-time

  By adding a new flag, having `Node.on_run` directly handle the serialization of results (making `_on_run` and `_run_args` new abstract methods that have the behaviour of the old public methods), and deserializing temporary results instead of running when a node is already running and such results exist

* Format black

* Remove redundant arg

* Save a checkpoint when running with result serialization

  So that the graph gets saved with the serializer in a running state

* Refactor: extract method

* Catch the case that you re-run a node still waiting for its serialized result

* Add a helper method for resuming composite runs from broken processes

* Revert resume_from_broken_process

* Make cleaning revert to false

  It's a bit messier for the filesystem, but for now let's default to keeping the data around

* Make the flag private

  I want this functionality in, but I'm not at all happy with the UI, and don't totally trust edge cases (e.g. input changing under our feet), so let's put it in private for now in anticipation of changes

* Remove the checkpoint save

  We'll get a recovery file when we close the parent process anyhow

* Extend doc

* Don't delete checkpoint as it's no longer written

* Refactor: slide

* Update HPC example notebook

  With a real living example of `Node._serialize_result` working

---------

Co-authored-by: pyiron-runner <pyiron@mpie.de>
1 parent e4a066c commit c57d387

File tree: 9 files changed, +1421 −54 lines changed

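For orientation, the following is a minimal sketch of the flow this patch enables, mirroring the `test_result_serialization` unit test added below. The function `plus_one`, the label "persistent_demo", and the use of the `Workflow` class with its `create.function_node` factory are illustrative assumptions rather than part of this diff:

    from pyiron_workflow import Workflow

    def plus_one(x: int = 0) -> int:
        return x + 1

    wf = Workflow("persistent_demo")
    wf.child = Workflow.create.function_node(plus_one, x=42)

    wf.child._serialize_result = True  # cloudpickle the run result to disk on completion
    wf.child._do_clean = False  # keep the temporary file around (the new default)

    wf.run()
    assert wf.child._temporary_result_file.is_file()  # <node directory>/run_result.tmp

    # Had the parent process died mid-run and the graph been recovered with the
    # child still flagged as `running`, calling `wf.run()` again would load this
    # file instead of re-executing the child.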

notebooks/deepdive.ipynb

+1 −1
@@ -3391,7 +3391,7 @@
     " File \"/Users/huber/anaconda3/envs/pyiron_311/lib/python3.11/concurrent/futures/process.py\", line 261, in _process_worker\n",
     " r = call_item.fn(*call_item.args, **call_item.kwargs)\n",
     " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
-    " File \"/Users/huber/work/pyiron/pyiron_workflow/pyiron_workflow/nodes/function.py\", line 317, in on_run\n",
+    " File \"/Users/huber/work/pyiron/pyiron_workflow/pyiron_workflow/nodes/function.py\", line 317, in _on_run\n",
     " return self.node_function(**kwargs)\n",
     " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
     " File \"/Users/huber/work/pyiron/pyiron_workflow/pyiron_workflow/nodes/standard.py\", line 518, in Add\n",

notebooks/hpc_example.ipynb

+1,238 −23
Large diffs are not rendered by default.

pyiron_workflow/node.py

+86 −5
@@ -7,11 +7,12 @@
 
 from __future__ import annotations
 
-from abc import ABC
+from abc import ABC, abstractmethod
 from concurrent.futures import Future
 from importlib import import_module
 from typing import Any, Literal, Optional, TYPE_CHECKING
 
+import cloudpickle
 from pyiron_snippets.colors import SeabornColors
 from pyiron_snippets.dotdict import DotDict
 
@@ -152,8 +153,8 @@ class Node(
 
     This is an abstract class.
     Children *must* define how :attr:`inputs` and :attr:`outputs` are constructed,
-    what will happen :meth:`on_run`, the :attr:`run_args` that will get passed to
-    :meth:`on_run`, and how to :meth:`process_run_result` once :meth:`on_run` finishes.
+    what will happen :meth:`_on_run`, the :attr:`run_args` that will get passed to
+    :meth:`_on_run`, and how to :meth:`process_run_result` once :meth:`_on_run` finishes.
     They may optionally add additional signal channels to the signals IO.
 
     Attributes:
@@ -192,6 +193,9 @@ class Node(
         autoload (Literal["pickle"] | StorageInterface | None): Whether to check
             for a matching saved node and what storage back end to use to do so (no
             auto-loading if the back end is `None`.)
+        _serialize_result (bool): (IN DEVELOPMENT) Cloudpickle the output of running
+            the node; this is useful if the run is happening in a parallel process and
+            the parent process may be killed before it is finished. (Default is False.)
         signals (pyiron_workflow.io.Signals): A container for input and output
             signals, which are channels for controlling execution flow. By default, has
             a :attr:`signals.inputs.run` channel which has a callback to the
@@ -218,7 +222,7 @@ class Node(
             its internal structure.
         execute: An alias for :meth:`run`, but with flags to run right here, right now,
             and with the input it currently has.
-        on_run: **Abstract.** Do the thing. What thing must be specified by child
+        _on_run: **Abstract.** Do the thing. What thing must be specified by child
             classes.
         pull: An alias for :meth:`run` that runs everything upstream, then runs this
             node (but doesn't fire off the `ran` signal, so nothing happens farther
@@ -227,7 +231,7 @@ class Node(
             object is encountered).
         replace_with: If the node belongs to a parent, attempts to replace itself in
             that parent with a new provided node.
-        run: Run the node function from :meth:`on_run`. Handles status automatically.
+        run: Run the node function from :meth:`_on_run`. Handles status automatically.
             Various execution options are available as boolean flags.
         set_input_values: Allows input channels' values to be updated without any
             running.
@@ -290,6 +294,10 @@ def __init__(
         )
         self.checkpoint = checkpoint
         self.recovery: Literal["pickle"] | StorageInterface | None = "pickle"
+        self._serialize_result = False  # Advertised, but private to indicate
+        # under-development status -- API may change to be more user-friendly
+        self._do_clean: bool = False  # Power-user override for cleaning up temporary
+        # serialized results and empty directories (or not).
         self._cached_inputs = None
         self._user_data = {}  # A place for power-users to bypass node-injection
 
@@ -373,6 +381,29 @@ def _readiness_error_message(self) -> str:
             f" conform to type hints.\n" + self.readiness_report
         )
 
+    def on_run(self, *args, **kwargs) -> Any:
+        save_result: bool = args[0]
+        args = args[1:]
+        result = self._on_run(*args, **kwargs)
+        if save_result:
+            self._temporary_result_pickle(result)
+        return result
+
+    @abstractmethod
+    def _on_run(self, *args, **kwargs) -> Any:
+        pass
+
+    @property
+    def run_args(self) -> tuple[tuple, dict]:
+        args, kwargs = self._run_args
+        args = (self._serialize_result,) + args
+        return args, kwargs
+
+    @property
+    @abstractmethod
+    def _run_args(self, *args, **kwargs) -> Any:
+        pass
+
     def run(
         self,
         *args,
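Under the new contract, concrete node classes implement the private hooks `_on_run` and `_run_args`, while the public `on_run` and `run_args` transparently inject and consume the `save_result` flag. A hypothetical minimal subclass for illustration (eliding the IO construction and `process_run_result` machinery a real node also needs):

    class Square(Node):  # sketch only; real nodes come from the usual factories
        def _on_run(self, x):
            return x**2

        @property
        def _run_args(self) -> tuple[tuple, dict]:
            return (self.inputs.x.value,), {}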
@@ -431,6 +462,22 @@ def run(
             Kwargs updating input channel values happens _first_ and will get
             overwritten by any subsequent graph-based data manipulation.
         """
+        if self.running and self._serialize_result:
+            if self._temporary_result_file.is_file():
+                return self._finish_run(
+                    self._temporary_result_unpickle(),
+                    raise_run_exceptions=raise_run_exceptions,
+                    run_exception_kwargs={},
+                    run_finally_kwargs={
+                        "emit_ran_signal": emit_ran_signal,
+                        "raise_run_exceptions": raise_run_exceptions,
+                    },
+                )
+            else:
+                raise ValueError(
+                    f"{self.full_label} is still waiting for a serialized result"
+                )
+
         self.set_input_values(*args, **kwargs)
 
         return super().run(
@@ -520,6 +567,9 @@ def _run_finally(self, /, emit_ran_signal: bool, raise_run_exceptions: bool):
                 backend=self.recovery, filename=self.as_path().joinpath("recovery")
             )
 
+        if self._do_clean:
+            self._clean_graph_directory()
+
     def run_data_tree(self, run_parent_trees_too=False) -> None:
         """
         Use topological analysis to build a tree of all upstream dependencies and run
@@ -628,6 +678,21 @@ def cache_hit(self):
         except:
             return False
 
+    @property
+    def _temporary_result_file(self):
+        return self.as_path().joinpath("run_result.tmp")
+
+    def _temporary_result_pickle(self, results):
+        self._temporary_result_file.parent.mkdir(parents=True, exist_ok=True)
+        self._temporary_result_file.touch(exist_ok=False)
+        with self._temporary_result_file.open("wb") as f:
+            cloudpickle.dump(results, f)
+
+    def _temporary_result_unpickle(self):
+        with self._temporary_result_file.open("rb") as f:
+            results = cloudpickle.load(f)
+        return results
+
     def _outputs_to_run_return(self):
         return DotDict(self.outputs.to_value_dict())
 
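cloudpickle is used here rather than the standard-library pickle presumably because run results may reference objects plain pickle rejects (lambdas, locally or dynamically defined functions and classes); note also that `touch(exist_ok=False)` prevents silently overwriting an existing temporary result. A standalone roundtrip for illustration:

    import cloudpickle

    result = {"y": 43, "postprocess": lambda v: v - 1}  # the lambda defeats plain pickle
    restored = cloudpickle.loads(cloudpickle.dumps(result))
    assert restored["postprocess"](restored["y"]) == 42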
@@ -994,6 +1059,22 @@ def report_import_readiness(self, tabs=0, report_so_far=""):
             f"{'ok' if self.import_ready else 'NOT IMPORTABLE'}"
         )
 
+    def _clean_graph_directory(self):
+        """
+        Delete the temporary results file (if any), and then go from this node's
+        semantic directory up to its semantic root's directory removing any empty
+        directories. Note: doesn't do a sophisticated walk, so sibling empty
+        directories will cause a parent to identify as non-empty.
+        """
+        self._temporary_result_file.unlink(missing_ok=True)
+
+        # Recursively remove empty directories
+        root_directory = self.semantic_root.as_path().parent
+        for parent in self._temporary_result_file.parents:
+            if parent == root_directory or not parent.exists() or any(parent.iterdir()):
+                break
+            parent.rmdir()
+
     def display_state(self, state=None, ignore_private=True):
         state = dict(self.__getstate__()) if state is None else state
         if self.parent is not None:
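To make the cleanup walk concrete, here is a standalone sketch with hypothetical paths: starting from the temporary file's own directory, each ancestor is removed only while it is empty, stopping at the semantic root's parent or at the first non-empty (or already-missing) directory:

    from pathlib import Path

    tmp = Path("graph_root/subgraph/node/run_result.tmp")  # hypothetical layout
    root_directory = Path("graph_root").parent  # stands in for semantic_root.as_path().parent

    tmp.unlink(missing_ok=True)
    for parent in tmp.parents:  # node/, then subgraph/, then graph_root/, ...
        if parent == root_directory or not parent.exists() or any(parent.iterdir()):
            break
        parent.rmdir()  # rmdir only ever succeeds on empty directories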

pyiron_workflow/nodes/composite.py

+16 −7
@@ -141,16 +141,27 @@ def deactivate_strict_hints(self):
         for node in self:
             node.deactivate_strict_hints()
 
-    def on_run(self):
+    def _on_run(self):
         # Reset provenance and run status trackers
         self.provenance_by_execution = []
         self.provenance_by_completion = []
-        self.running_children = []
+        self.running_children = [n.label for n in self if n.running]
         self.signal_queue = []
 
-        for node in self.starting_nodes:
-            node.run()
+        if len(self.running_children) > 0:  # Start from a broken process
+            for label in self.running_children:
+                self.children[label].run()
+                # Running children will find serialized result and proceed,
+                # or raise an error because they're already running
+        else:  # Start fresh
+            for node in self.starting_nodes:
+                node.run()
 
+        self._run_while_children_or_signals_exist()
+
+        return self
+
+    def _run_while_children_or_signals_exist(self):
         errors = {}
         while len(self.running_children) > 0 or len(self.signal_queue) > 0:
             try:
@@ -172,8 +183,6 @@ def on_run(self):
                 f"{self.full_label} encountered multiple errors in children: {errors}"
             ) from None
 
-        return self
-
     def register_child_starting(self, child: Node) -> None:
         """
         To be called by children when they start their run cycle.
@@ -218,7 +227,7 @@ def register_child_emitting(self, child: Node) -> None:
         self.signal_queue.append((firing, receiving))
 
     @property
-    def run_args(self) -> tuple[tuple, dict]:
+    def _run_args(self) -> tuple[tuple, dict]:
         return (), {}
 
     def process_run_result(self, run_output):
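The practical effect is that a composite recovered from a dead parent process can simply be re-run: `_on_run` seeds `running_children` from any children still flagged as running and re-runs just those, and each such child either deserializes its finished result or raises because it has none yet. A rough sketch, where `comp` is a hypothetical recovered composite:

    try:
        comp.run()  # re-runs only the still-`running` children
    except ValueError:
        # A child had no run_result.tmp yet, i.e. its executor genuinely
        # hasn't finished; try again later.
        pass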

pyiron_workflow/nodes/for_loop.py

+2 −2
@@ -236,9 +236,9 @@ def _setup_node(self) -> None:
         self.starting_nodes = input_nodes
         self._input_node_labels = tuple(n.label for n in input_nodes)
 
-    def on_run(self):
+    def _on_run(self):
         self._build_body()
-        return super().on_run()
+        return super()._on_run()
 
     def _build_body(self):
         """

pyiron_workflow/nodes/function.py

+2 −2
@@ -313,11 +313,11 @@ def _build_outputs_preview(cls) -> dict[str, Any]:
         return preview if len(preview) > 0 else {"None": type(None)}
         # If clause facilitates functions with no return value
 
-    def on_run(self, **kwargs):
+    def _on_run(self, **kwargs):
         return self.node_function(**kwargs)
 
     @property
-    def run_args(self) -> tuple[tuple, dict]:
+    def _run_args(self) -> tuple[tuple, dict]:
         kwargs = self.inputs.to_value_dict()
         return (), kwargs
 
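For function nodes, then, `_run_args` is simply the inputs' value dictionary, and the public `run_args` prepends the serialization flag. A quick illustration, reusing the assumed `Workflow.create.function_node` factory from the sketch at the top:

    def add(x, y):
        return x + y

    n = Workflow.create.function_node(add, x=1, y=2)
    args, kwargs = n.run_args
    # args == (False,)  -- the node's _serialize_result flag, injected by Node.run_args
    # kwargs == {"x": 1, "y": 2}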
pyiron_workflow/nodes/transform.py

+12 −12
@@ -36,14 +36,14 @@ class FromManyInputs(Transformer, ABC):
 
     # _build_inputs_preview required from parent class
     # Inputs convert to `run_args` as a value dictionary
-    # This must be commensurate with the internal expectations of on_run
+    # This must be commensurate with the internal expectations of _on_run
 
     @abstractmethod
-    def on_run(self, **inputs_to_value_dict) -> Any:
+    def _on_run(self, **inputs_to_value_dict) -> Any:
         """Must take inputs kwargs"""
 
     @property
-    def run_args(self) -> tuple[tuple, dict]:
+    def _run_args(self) -> tuple[tuple, dict]:
         return (), self.inputs.to_value_dict()
 
     @classmethod
@@ -64,11 +64,11 @@ class ToManyOutputs(Transformer, ABC):
     # Must be commensurate with the dictionary returned by transform_to_output
 
     @abstractmethod
-    def on_run(self, input_object) -> callable[..., Any | tuple]:
+    def _on_run(self, input_object) -> callable[..., Any | tuple]:
         """Must take the single object to be transformed"""
 
     @property
-    def run_args(self) -> tuple[tuple, dict]:
+    def _run_args(self) -> tuple[tuple, dict]:
         return (self.inputs[self._input_name].value,), {}
 
     @classmethod
@@ -89,7 +89,7 @@ class InputsToList(_HasLength, FromManyInputs, ABC):
     _output_name: ClassVar[str] = "list"
     _output_type_hint: ClassVar[Any] = list
 
-    def on_run(self, **inputs_to_value_dict):
+    def _on_run(self, **inputs_to_value_dict):
         return list(inputs_to_value_dict.values())
 
     @classmethod
@@ -101,7 +101,7 @@ class ListToOutputs(_HasLength, ToManyOutputs, ABC):
     _input_name: ClassVar[str] = "list"
     _input_type_hint: ClassVar[Any] = list
 
-    def on_run(self, input_object: list):
+    def _on_run(self, input_object: list):
         return {f"item_{i}": v for i, v in enumerate(input_object)}
 
     @classmethod
@@ -184,7 +184,7 @@ class InputsToDict(FromManyInputs, ABC):
         list[str] | dict[str, tuple[Any | None, Any | NOT_DATA]]
     ]
 
-    def on_run(self, **inputs_to_value_dict):
+    def _on_run(self, **inputs_to_value_dict):
         return inputs_to_value_dict
 
     @classmethod
@@ -284,7 +284,7 @@ class InputsToDataframe(_HasLength, FromManyInputs, ABC):
     _output_name: ClassVar[str] = "df"
     _output_type_hint: ClassVar[Any] = DataFrame
 
-    def on_run(self, *rows: dict[str, Any]) -> Any:
+    def _on_run(self, *rows: dict[str, Any]) -> Any:
         df_dict = {}
         for i, row in enumerate(rows):
             for key, value in row.items():
@@ -295,7 +295,7 @@ def on_run(self, *rows: dict[str, Any]) -> Any:
         return DataFrame(df_dict)
 
     @property
-    def run_args(self) -> tuple[tuple, dict]:
+    def _run_args(self) -> tuple[tuple, dict]:
         return tuple(self.inputs.to_value_dict().values()), {}
 
     @classmethod
@@ -363,11 +363,11 @@ def _setup_node(self) -> None:
         ):
             self.inputs[name] = self._dataclass_fields[name].default_factory()
 
-    def on_run(self, **inputs_to_value_dict):
+    def _on_run(self, **inputs_to_value_dict):
         return self.dataclass(**inputs_to_value_dict)
 
     @property
-    def run_args(self) -> tuple[tuple, dict]:
+    def _run_args(self) -> tuple[tuple, dict]:
         return (), self.inputs.to_value_dict()
 
     @classmethod

tests/unit/nodes/test_composite.py

+32
@@ -603,6 +603,38 @@ def test_with_executor(self):
                     "retain its executor"
                 )
 
+    def test_result_serialization(self):
+        """
+        This is actually only a useful feature if you have an executor which will
+        continue the process _after_ the parent python process has been shut down
+        (e.g. you sent the run code off to a slurm queue using `executorlib`.), but
+        we'll ensure that the plumbing works here by faking things a bit.
+        """
+        self.comp.use_cache = False
+
+        self.comp.child = Composite.create.function_node(plus_one, x=42)
+        self.comp.starting_nodes = [self.comp.child]
+
+        self.comp.child._serialize_result = True
+        self.comp.child.use_cache = False
+        self.comp.child._do_clean = False
+
+        out = self.comp.run()
+        self.assertTrue(self.comp.child._temporary_result_file.is_file())
+        self.assertEqual(self.comp.child.outputs.y.value, 42 + 1)
+
+        self.comp.child.running = True  # Fake it
+        self.comp.child._do_clean = True  # Clean up this time
+        self.comp.run()
+
+        self.assertFalse(self.comp.child._temporary_result_file.is_file())
+        self.assertEqual(self.comp.child.outputs.y.value, 42 + 1)
+        self.assertFalse(
+            self.comp.as_path().is_dir(),
+            msg="Actually, we expect cleanup to have removed empty directories up to "
+            "and including the semantic root's own directory"
+        )
+
 
 if __name__ == '__main__':
     unittest.main()
