From 3edd895878d65ff155323bed36092328e53472f7 Mon Sep 17 00:00:00 2001
From: Alyssa Cote <46540273+AlyssaCote@users.noreply.github.com>
Date: Wed, 10 Apr 2024 09:08:00 -0700
Subject: [PATCH] Store SmartSim entity logs under the .smartsim directory
 (#532)

Moves .out and .err files under the `.smartsim` directory and creates a symlink to those files under the experiment directory.
[ committed by @AlyssaCote ]
[ reviewed by @ashao , @al-rigazzi ]
---
 doc/changelog.rst                    |   5 +
 smartsim/_core/control/controller.py |  39 ++++-
 smartsim/_core/launcher/step/step.py |  18 ++-
 tests/test_controller_errors.py      |   4 +
 tests/test_output_files.py           | 227 +++++++++++++++++++++++++++
 tests/test_slurm_settings.py         |   3 +
 6 files changed, 292 insertions(+), 4 deletions(-)
 create mode 100644 tests/test_output_files.py

diff --git a/doc/changelog.rst b/doc/changelog.rst
index ec22f7acd..167266fa3 100644
--- a/doc/changelog.rst
+++ b/doc/changelog.rst
@@ -18,6 +18,7 @@ To be released at some future point in time
 
 Description
 
+- Historical output files stored under .smartsim directory
 - Add option to build Torch backend without the Intel Math Kernel Library
 - Fix ReadTheDocs build issue
 - Promote device options to an Enum
@@ -37,6 +38,9 @@ Description
 
 Detailed Notes
 
+- The dashboard needs to display historical logs, so log files are written
+  out under the .smartsim directory and files under the experiment
+  directory are symlinked to them. (SmartSim-PR532_)
 - Add an option to smart build "--torch_with_mkl"/"--no_torch_with_mkl" to
   prevent Torch from trying to link in the Intel Math Kernel Library. This
   is needed because on machines that have the Intel compilers installed, the
@@ -89,6 +93,7 @@ Detailed Notes
   handler. SmartSim will now attempt to kill any launched jobs before calling
   the previously registered signal handler. (SmartSim-PR535_)
 
+.. _SmartSim-PR532: https://github.com/CrayLabs/SmartSim/pull/532
 .. _SmartSim-PR538: https://github.com/CrayLabs/SmartSim/pull/538
 .. _SmartSim-PR537: https://github.com/CrayLabs/SmartSim/pull/537
 .. _SmartSim-PR498: https://github.com/CrayLabs/SmartSim/pull/498
diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py
index 989d66d2c..ae01e396c 100644
--- a/smartsim/_core/control/controller.py
+++ b/smartsim/_core/control/controller.py
@@ -355,6 +355,36 @@ def init_launcher(self, launcher: str) -> None:
         else:
             raise TypeError("Must provide a 'launcher' argument")
 
+    @staticmethod
+    def symlink_output_files(
+        job_step: Step, entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]]
+    ) -> None:
+        """Create symlinks for entity output files that point to the output files
+        under the .smartsim directory
+
+        :param job_step: Job step instance
+        :type job_step: Step
+        :param entity: Entity instance
+        :type entity: SmartSimEntity | EntitySequence[SmartSimEntity]
+        """
+        historical_out, historical_err = map(pathlib.Path, job_step.get_output_files())
+        entity_out = pathlib.Path(entity.path) / f"{entity.name}.out"
+        entity_err = pathlib.Path(entity.path) / f"{entity.name}.err"
+
+        # check if there is already a link to a previous run
+        if entity_out.is_symlink() or entity_err.is_symlink():
+            entity_out.unlink()
+            entity_err.unlink()
+
+        try:
+            entity_out.symlink_to(historical_out)
+            entity_err.symlink_to(historical_err)
+        except FileNotFoundError as fnf:
+            raise FileNotFoundError(
+                f"Output files for {entity.name} could not be found. "
+                "Symlinking files failed."
+            ) from fnf
+
     def _launch(
         self, exp_name: str, exp_path: str, manifest: Manifest
     ) -> LaunchedManifest[t.Tuple[str, Step]]:
@@ -470,6 +500,11 @@ def _launch_orchestrator(
             manifest_builder.add_database(
                 orchestrator, [(orc_batch_step.name, step) for step in substeps]
             )
+
+            # symlink substeps to maintain directory structure
+            for substep, substep_entity in zip(substeps, orchestrator.entities):
+                self.symlink_output_files(substep, substep_entity)
+
             self._launch_step(orc_batch_step, orchestrator)
 
         # if orchestrator was run on existing allocation, locally, or in allocation
@@ -537,6 +572,7 @@ def _launch_step(
         if completed_job is None and (
             entity.name not in self._jobs.jobs and entity.name not in self._jobs.db_jobs
         ):
+            self.symlink_output_files(job_step, entity)
             try:
                 job_id = self._launcher.run(job_step)
             except LauncherError as e:
@@ -548,6 +584,7 @@ def _launch_step(
         # if the completed job does exist and the entity passed in is the same
         # that has ran and completed, relaunch the entity.
         elif completed_job is not None and completed_job.entity is entity:
+            self.symlink_output_files(job_step, entity)
             try:
                 job_id = self._launcher.run(job_step)
             except LauncherError as e:
@@ -599,7 +636,7 @@ def _create_batch_job_step(
             entity_list.name, entity_list.path, entity_list.batch_settings
         )
         batch_step.meta["entity_type"] = str(type(entity_list).__name__).lower()
-        batch_step.meta["status_dir"] = str(telemetry_dir / entity_list.name)
+        batch_step.meta["status_dir"] = str(telemetry_dir)
 
         substeps = []
         for entity in entity_list.entities:
diff --git a/smartsim/_core/launcher/step/step.py b/smartsim/_core/launcher/step/step.py
index 2a4249105..deeef6b73 100644
--- a/smartsim/_core/launcher/step/step.py
+++ b/smartsim/_core/launcher/step/step.py
@@ -28,6 +28,7 @@
 
 import functools
 import os.path as osp
+import pathlib
 import sys
 import time
 import typing as t
@@ -66,10 +67,21 @@ def _create_unique_name(entity_name: str) -> str:
         step_name = entity_name + "-" + get_base_36_repr(time.time_ns())
         return step_name
 
+    @staticmethod
+    def _ensure_output_directory_exists(output_dir: str) -> None:
+        """Create the directory for the step output if it doesn't exist already"""
+        if not osp.exists(output_dir):
+            pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
+
     def get_output_files(self) -> t.Tuple[str, str]:
-        """Return two paths to error and output files based on cwd"""
-        output = self.get_step_file(ending=".out")
-        error = self.get_step_file(ending=".err")
+        """Return two paths to error and output files based on metadata directory"""
+        try:
+            output_dir = self.meta["status_dir"]
+        except KeyError as exc:
+            raise KeyError("Status directory for this step has not been set.") from exc
+        self._ensure_output_directory_exists(output_dir)
+        output = osp.join(output_dir, f"{self.entity_name}.out")
+        error = osp.join(output_dir, f"{self.entity_name}.err")
         return output, error
 
     def get_step_file(
diff --git a/tests/test_controller_errors.py b/tests/test_controller_errors.py
index 043cd0906..f1e6fef9f 100644
--- a/tests/test_controller_errors.py
+++ b/tests/test_controller_errors.py
@@ -161,6 +161,8 @@ def test_restarting_entity(test_dir, wlmutils, entity):
     """Validate restarting a completed Model/Ensemble job"""
     step_settings = RunSettings("echo")
     step = MockStep("mock-step", test_dir, step_settings)
+    step.meta["status_dir"] = test_dir
+    entity.path = test_dir
     test_launcher = wlmutils.get_test_launcher()
     controller = Controller(test_launcher)
     controller._jobs.add_job(entity.name, job_id="1234", entity=entity)
@@ -172,6 +174,8 @@ def test_restarting_orch(test_dir, wlmutils):
     """Validate restarting a completed Orchestrator job"""
     step_settings = RunSettings("echo")
     step = MockStep("mock-step", test_dir, step_settings)
+    step.meta["status_dir"] = test_dir
+    orc.path = test_dir
     test_launcher = wlmutils.get_test_launcher()
     controller = Controller(test_launcher)
     controller._jobs.add_job(orc.name, job_id="1234", entity=orc)
diff --git a/tests/test_output_files.py b/tests/test_output_files.py
new file mode 100644
index 000000000..987fa9068
--- /dev/null
+++ b/tests/test_output_files.py
@@ -0,0 +1,227 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import pathlib
+
+import pytest
+
+from smartsim import Experiment
+from smartsim._core.config import CONFIG
+from smartsim._core.control.controller import Controller
+from smartsim._core.launcher.step import Step
+from smartsim.database.orchestrator import Orchestrator
+from smartsim.entity.ensemble import Ensemble
+from smartsim.entity.model import Model
+from smartsim.settings.base import RunSettings
+from smartsim.settings.slurmSettings import SbatchSettings, SrunSettings
+
+controller = Controller()
+slurm_controller = Controller(launcher="slurm")
+
+rs = RunSettings("echo", ["spam", "eggs"])
+bs = SbatchSettings()
+batch_rs = SrunSettings("echo", ["spam", "eggs"])
+
+ens = Ensemble("ens", params={}, run_settings=rs, batch_settings=bs, replicas=3)
+orc = Orchestrator(db_nodes=3, batch=True, launcher="slurm", run_command="srun")
+model = Model("test_model", {}, "", rs)
+batch_model = Model("batch_test_model", {}, "", batch_rs, batch_settings=bs)
+
+
+def test_mutated_model_output(test_dir):
+    exp_name = "test-mutated-model-output"
+    exp = Experiment(exp_name, launcher="local", exp_path=test_dir)
+
+    test_model = exp.create_model("test_model", path=test_dir, run_settings=rs)
+    exp.generate(test_model)
+    exp.start(test_model, block=True)
+
+    assert pathlib.Path(test_model.path).exists()
+    assert pathlib.Path(test_model.path, f"{test_model.name}.out").is_symlink()
+    assert pathlib.Path(test_model.path, f"{test_model.name}.err").is_symlink()
+
+    with open(pathlib.Path(test_model.path, f"{test_model.name}.out"), "r") as file:
+        log_contents = file.read()
+
+    assert "spam eggs" in log_contents
+
+    first_link = os.readlink(pathlib.Path(test_model.path, f"{test_model.name}.out"))
+
+    test_model.run_settings.exe_args = ["hello", "world"]
+    exp.generate(test_model, overwrite=True)
+    exp.start(test_model, block=True)
+
+    assert pathlib.Path(test_model.path).exists()
+    assert pathlib.Path(test_model.path, f"{test_model.name}.out").is_symlink()
+    assert pathlib.Path(test_model.path, f"{test_model.name}.err").is_symlink()
+
+    with open(pathlib.Path(test_model.path, f"{test_model.name}.out"), "r") as file:
+        log_contents = file.read()
+
+    assert "hello world" in log_contents
+
+    second_link = os.readlink(pathlib.Path(test_model.path, f"{test_model.name}.out"))
+
+    with open(first_link, "r") as file:
+        first_historical_log = file.read()
+
+    assert "spam eggs" in first_historical_log
+
+    with open(second_link, "r") as file:
+        second_historical_log = file.read()
+
+    assert "hello world" in second_historical_log
+
+
+def test_get_output_files_with_create_job_step(test_dir):
+    """Testing output files through _create_job_step"""
+    exp_dir = pathlib.Path(test_dir)
+    status_dir = exp_dir / CONFIG.telemetry_subdir / model.type
+    step = controller._create_job_step(model, status_dir)
+    expected_out_path = status_dir / model.name / (model.name + ".out")
+    expected_err_path = status_dir / model.name / (model.name + ".err")
+    assert step.get_output_files() == (str(expected_out_path), str(expected_err_path))
+
+
+@pytest.mark.parametrize(
+    "entity",
+    [pytest.param(ens, id="ensemble"), pytest.param(orc, id="orchestrator")],
+)
+def test_get_output_files_with_create_batch_job_step(entity, test_dir):
+    """Testing output files through _create_batch_job_step"""
+    exp_dir = pathlib.Path(test_dir)
+    status_dir = exp_dir / CONFIG.telemetry_subdir / entity.type
+    batch_step, substeps = slurm_controller._create_batch_job_step(entity, status_dir)
+    for step in substeps:
+        # example output path for a member of an Ensemble is
+        # .smartsim/telemetry/Ensemble/ens/ens_0/ens_0.out
+        expected_out_path = (
+            status_dir / entity.name / step.entity_name / (step.entity_name + ".out")
+        )
+        expected_err_path = (
+            status_dir / entity.name / step.entity_name / (step.entity_name + ".err")
+        )
+        assert step.get_output_files() == (
+            str(expected_out_path),
+            str(expected_err_path),
+        )
+
+
+def test_model_get_output_files(test_dir):
+    """Testing model output files with manual step creation"""
+    exp_dir = pathlib.Path(test_dir)
+    step = Step(model.name, model.path, model.run_settings)
+    step.meta["status_dir"] = exp_dir / "output_dir"
+    expected_out_path = step.meta["status_dir"] / (model.name + ".out")
+    expected_err_path = step.meta["status_dir"] / (model.name + ".err")
+    assert step.get_output_files() == (str(expected_out_path), str(expected_err_path))
+
+
+def test_ensemble_get_output_files(test_dir):
+    """Testing ensemble output files with manual step creation"""
+    exp_dir = pathlib.Path(test_dir)
+    for member in ens.models:
+        step = Step(member.name, member.path, member.run_settings)
+        step.meta["status_dir"] = exp_dir / "output_dir"
+        expected_out_path = step.meta["status_dir"] / (member.name + ".out")
+        expected_err_path = step.meta["status_dir"] / (member.name + ".err")
+        assert step.get_output_files() == (
+            str(expected_out_path),
+            str(expected_err_path),
+        )
+
+
+def test_get_output_files_no_status_dir(test_dir):
+    """Test that a step not having a status directory throws a KeyError"""
+    step_settings = RunSettings("echo")
+    step = Step("mock-step", test_dir, step_settings)
+    with pytest.raises(KeyError):
+        out, err = step.get_output_files()
+
+
+@pytest.mark.parametrize(
+    "entity",
+    [pytest.param(ens, id="ensemble"), pytest.param(model, id="model")],
+)
+def test_symlink(test_dir, entity):
+    """Test symlinking historical output files"""
+    entity.path = test_dir
+    if entity.type == Ensemble:
+        for member in ens.models:
+            symlink_with_create_job_step(test_dir, member)
+    else:
+        symlink_with_create_job_step(test_dir, entity)
+
+
+def symlink_with_create_job_step(test_dir, entity):
+    """Function that helps cut down on repeated testing code"""
+    exp_dir = pathlib.Path(test_dir)
+    entity.path = test_dir
+    status_dir = exp_dir / CONFIG.telemetry_subdir / entity.type
+    step = controller._create_job_step(entity, status_dir)
+    controller.symlink_output_files(step, entity)
+    assert pathlib.Path(entity.path, f"{entity.name}.out").is_symlink()
+    assert pathlib.Path(entity.path, f"{entity.name}.err").is_symlink()
+    assert os.readlink(pathlib.Path(entity.path, f"{entity.name}.out")) == str(
+        status_dir / entity.name / (entity.name + ".out")
+    )
+    assert os.readlink(pathlib.Path(entity.path, f"{entity.name}.err")) == str(
+        status_dir / entity.name / (entity.name + ".err")
+    )
+
+
+@pytest.mark.parametrize(
+    "entity",
+    [pytest.param(ens, id="ensemble"), pytest.param(orc, id="orchestrator")],
+)
+def test_batch_symlink(entity, test_dir):
+    """Test symlinking historical output files"""
+    exp_dir = pathlib.Path(test_dir)
+    entity.path = test_dir
+    status_dir = exp_dir / CONFIG.telemetry_subdir / entity.type
+    batch_step, substeps = slurm_controller._create_batch_job_step(entity, status_dir)
+    for step in substeps:
+        slurm_controller.symlink_output_files(step, entity)
+        assert pathlib.Path(entity.path, f"{entity.name}.out").is_symlink()
+        assert pathlib.Path(entity.path, f"{entity.name}.err").is_symlink()
+        assert os.readlink(pathlib.Path(entity.path, f"{entity.name}.out")) == str(
+            status_dir / entity.name / step.entity_name / (step.entity_name + ".out")
+        )
+        assert os.readlink(pathlib.Path(entity.path, f"{entity.name}.err")) == str(
+            status_dir / entity.name / step.entity_name / (step.entity_name + ".err")
+        )
+
+
+def test_symlink_error(test_dir):
+    """Ensure FileNotFoundError is thrown"""
+    bad_model = Model(
+        "bad_model", {}, pathlib.Path(test_dir, "badpath"), RunSettings("echo")
+    )
+    telem_dir = pathlib.Path(test_dir, "bad_model_telemetry")
+    bad_step = controller._create_job_step(bad_model, telem_dir)
+    with pytest.raises(FileNotFoundError):
+        controller.symlink_output_files(bad_step, bad_model)
diff --git a/tests/test_slurm_settings.py b/tests/test_slurm_settings.py
index 51d4022dc..d9d820244 100644
--- a/tests/test_slurm_settings.py
+++ b/tests/test_slurm_settings.py
@@ -105,6 +105,7 @@ def test_mpmd_compound_env_exports():
 
     step = SrunStep("teststep", "./", srun)
 
+    step.meta["status_dir"] = ""
     launch_cmd = step.get_launch_cmd()
     env_cmds = [v for v in launch_cmd if v == "env"]
     assert "env" in launch_cmd and len(env_cmds) == 1
@@ -164,6 +165,7 @@ def test_mpmd_non_compound_env_exports():
 
     step = SrunStep("teststep", "./", srun)
 
+    step.meta["status_dir"] = ""
     launch_cmd = step.get_launch_cmd()
     env_cmds = [v for v in launch_cmd if v == "env"]
     assert "env" not in launch_cmd and len(env_cmds) == 0
@@ -223,6 +225,7 @@ def test_mpmd_non_compound_no_exports():
 
     step = SrunStep("teststep", "./", srun)
 
+    step.meta["status_dir"] = ""
     launch_cmd = step.get_launch_cmd()
     env_cmds = [v for v in launch_cmd if v == "env"]
     assert "env" not in launch_cmd and len(env_cmds) == 0