From 2ce639122ec895fa32c20b504209275013f1a6b1 Mon Sep 17 00:00:00 2001
From: Skander Moalla <skander.moalla@epfl.ch>
Date: Sun, 22 Dec 2024 04:19:24 +0100
Subject: [PATCH 1/9] Checkpointing

---
 README.md                                     |   3 +-
 reproducibility-scripts/template-sweep.yaml   |   8 +-
 .../configs/override/template_experiment.yaml |   2 +-
 src/template_package_name/configs/setup.yaml  |  20 ++-
 .../configs/template_experiment.yaml          |   2 +-
 .../template_experiment.py                    | 114 ++++++++++++++----
 src/template_package_name/utils/__init__.py   |   2 +-
 src/template_package_name/utils/config.py     |  90 ++++++++++++++
 src/template_package_name/utils/seeding.py    |  16 ++-
 template/README.md                            |   8 ++
 10 files changed, 231 insertions(+), 34 deletions(-)
 create mode 100644 src/template_package_name/utils/config.py

diff --git a/README.md b/README.md
index ec14612..464ec26 100644
--- a/README.md
+++ b/README.md
@@ -24,6 +24,7 @@ projects with hardware acceleration featuring:
   imports.
 - Experiment management, tracking, and sharing with [Hydra](https://hydra.cc/)
   and [Weights & Biases](https://wandb.ai/site).
+- Checkpointing setup for research experiments compatible with Weights & Biases.
 - Code quality with [pre-commit](https://pre-commit.com) hooks.
 
 The template makes collaboration and open-sourcing straightforward, avoiding setup issues and
@@ -39,7 +40,7 @@ or [this paper](https://github.com/CLAIRE-Labo/no-representation-no-trust) whose
 
 Follow this README to get started with the template.
 
-For a brief discussion of the template's design choices and a Q&A check `template/README.md` file.
+For a brief discussion of the template's design choices, features, and a Q&A, check the `template/README.md` file.
 
 ## Getting started with the template
 
diff --git a/reproducibility-scripts/template-sweep.yaml b/reproducibility-scripts/template-sweep.yaml
index 09eb18b..959fa3f 100644
--- a/reproducibility-scripts/template-sweep.yaml
+++ b/reproducibility-scripts/template-sweep.yaml
@@ -11,7 +11,13 @@ parameters:
   wandb.mode:
     value: online
   job_subdir:
-    value: my-tagged-experiment
+    value: some-special-experiment
+  seed:
+    value: 1
+  resuming.resume:
+    value: True
+  resuming.use_commit:
+    value: True
   some_number:
     values: [1, 2, 3]
 
diff --git a/src/template_package_name/configs/override/template_experiment.yaml b/src/template_package_name/configs/override/template_experiment.yaml
index 3fde7d9..8999c5b 100644
--- a/src/template_package_name/configs/override/template_experiment.yaml
+++ b/src/template_package_name/configs/override/template_experiment.yaml
@@ -1,4 +1,4 @@
 # @package _global_
 # The above line should appear in the override configs so that they sit at the root of the config tree.
 
-is_this_overridden: yes
+is_this_key_overridden: yes
diff --git a/src/template_package_name/configs/setup.yaml b/src/template_package_name/configs/setup.yaml
index 35b2a5c..307bb92 100644
--- a/src/template_package_name/configs/setup.yaml
+++ b/src/template_package_name/configs/setup.yaml
@@ -27,8 +27,8 @@ job_subdir: dev
 
 hydra:
   run:
-    # This is where the outputs of an individual run will be stored.
-    dir: outputs/${outputs_subdir}/${hydra.job.name}/${job_subdir}/${now:%Y-%m-%d_%H-%M-%S-%f}
+    # Finally, this is where the outputs of an individual run will be stored.
+    dir: outputs/${outputs_subdir}/${hydra.job.name}/${job_subdir}/${now:%Y-%m-%d--%H-%M-%S-%f}
   job:
     chdir: true
   verbose: false  # Set to true for logging at debug level.
@@ -42,3 +42,19 @@ wandb:
   anonymous: allow
   tags:
     - development
+  run_id: null
+
+run_dir: ${hydra:run.dir}
+resuming_dir: ${hydra:run.dir}
+
+resuming:
+  resume: False
+  use_commit: False
+  wandb_cache_bust: 0 # Limitation of wandb. Cannot create runs with the same ID if deleted previously.
+                      # Use this to refresh the id of the run and make it a "new" run.
+  exclude_keys: # Can be a deep key e.g. model.optimizer.lr
+    - data_dir
+    - outputs_dir
+    - run_dir
+    - resuming_dir
+    - wandb.run_id
diff --git a/src/template_package_name/configs/template_experiment.yaml b/src/template_package_name/configs/template_experiment.yaml
index 2b35641..f2823ab 100644
--- a/src/template_package_name/configs/template_experiment.yaml
+++ b/src/template_package_name/configs/template_experiment.yaml
@@ -13,4 +13,4 @@ defaults:
 
 some_arg: "some_default_value"
 some_number: 10
-is_this_overridden: no
+is_this_key_overridden: no
diff --git a/src/template_package_name/template_experiment.py b/src/template_package_name/template_experiment.py
index 607941f..9984a3e 100644
--- a/src/template_package_name/template_experiment.py
+++ b/src/template_package_name/template_experiment.py
@@ -2,55 +2,125 @@
 # Keep this, it's used as an example to run the code after a user installs the project.
 
 import logging
+import os
+import subprocess
+import sys
 from pathlib import Path
+from time import sleep
 
 import hydra
 import wandb
-from omegaconf import DictConfig, OmegaConf
+from omegaconf import DictConfig, OmegaConf, omegaconf
 
 from template_package_name import utils
 
+# Refer to utils.config for a description of the resolvers.
+utils.config.register_resolvers()
+
 # Hydra sets up the logger automatically.
 # https://hydra.cc/docs/tutorials/basic/running_your_app/logging/
 logger = logging.getLogger(__name__)
 
-# Resolvers can be used in the config files.
-# https://omegaconf.readthedocs.io/en/latest/custom_resolvers.html
-# They are useful when you want to make the default values of some config variables
-# result from direct computation of other config variables.
-# Only put variables meant to be edited by the user (as opposed to read-only variables described below)
-# and avoid making them too complicated, the point is not to write code in the config file.
-
-# Useful to evaluate expressions in the config file.
-OmegaConf.register_new_resolver("eval", eval, use_cache=True)
-# Generate a random seed and record it in the config of the experiment.
-OmegaConf.register_new_resolver(
-    "generate_random_seed", utils.seeding.generate_random_seed, use_cache=True
-)
-
 
 @hydra.main(version_base=None, config_path="configs", config_name="template_experiment")
 def main(config: DictConfig) -> None:
-    # Here you can make some computations with the config to add new keys, correct some values, etc.
-    # E.g., read-only variables that can be useful when navigating the experiments on wandb (filtering, sorting, etc.).
-    # Save the new config (as a file to record it) and pass it to wandb to record it with your experiment.
+    # The current working directory is a new directory unique to this run made by hydra, accessible by config.run_dir.
+    # A resuming directory uniquely identified by the config (and optionally the git sha)
+    # for storing checkpoints of the same experiment can be accessed via config.resuming_dir.
+    logger.info(f"Init directory: {Path.cwd()}")
+    resuming_dir, resuming_hash = utils.config.setup_resuming_dir(config)
+    logger.info(f"Run can be resumed from the directory: {resuming_dir}")
+    if config.resuming.resume:
+        os.chdir(resuming_dir)
+        logger.info(f"Resuming from the directory: {Path.cwd()}")
+        # Even if not resuming, you can still access the checkpoint directory (e.g., for analysis) via config.resuming_dir.
+
+    postprocess_and_save_config(config)
 
+    # If wandb.init hangs, it's likely that you're resuming a run that you already deleted on wandb.
+    # Increment config.resuming.wandb_cache_bust to start a new run.
+
+    # To resume a run in a sweep, find its wandb run id and pass it to the script alongside the same arguments
+    # the sweep agent started the run with.
+
+    wandb_run_id = config.wandb.run_id
+    if wandb_run_id is None:
+        if config.resuming.resume:
+            wandb_run_id = resuming_hash
     wandb.init(
-        config=OmegaConf.to_container(config, resolve=True, throw_on_missing=True),
+        id=wandb_run_id,
+        resume="allow" if config.resuming.resume else "never",
+        config=OmegaConf.to_container(config),
         project=config.wandb.project,
         tags=config.wandb.tags,
-        anonymous=config.wandb.anonymous,
         mode=config.wandb.mode,
+        anonymous=config.wandb.anonymous,
         dir=Path(config.wandb.dir).absolute(),
     )
 
+    # Use a custom step key when you log so that you can resume logging anywhere.
+    # For example, if the checkpoint is earlier than the last logged step in the crashed run, you can resume
+    # from steps already logged, and they will be rewritten (with the same value assuming reproducibility).
+    # E.g., wandb.log({"my_custom_step": i, "loss": loss})
+
+    # Re-log the information logged before wandb.init so that it is captured by wandb.
+    logger.info(f"Running command: {subprocess.list2cmdline(sys.argv)}")
+    logger.info(f"Init directory: {config.run_dir}")
     logger.info(f"Working directory: {Path.cwd()}")
-    logger.info(f"Running with config: \n{OmegaConf.to_yaml(config, resolve=True)}")
+    logger.info(f"Running with config: \n{OmegaConf.to_yaml(config)}")
+    if config.resuming.resume:
+        logger.info(f"Resuming from the directory: {Path.cwd()}")
 
     # Update this function whenever you have a library that needs to be seeded.
     utils.seeding.seed_everything(config)
 
-    wandb.log({"some_metric": config.some_number + 1})
+    # Example experiment
+    n = 100
+    # Loop from config.some_number*n to config.some_number*n + n and write 100 files to the disk.
+
+    # Attempt to resume
+    # Find the latest checkpoint of format file_{i}.txt
+    path = Path.cwd()
+    files = path.glob("file_*.txt")
+    files = sorted(files, key=lambda x: int(x.stem.split("_")[1]))
+    if files:
+        last_file = files[-1]
+        logger.info(f"Resuming from {last_file}")
+        j = int(last_file.stem.split("_")[1])
+    else:
+        j = config.some_number * n - 1
+
+    for i in range(j + 1, config.some_number * n + 100 + 1):
+        wandb.log({"iteration": i, "file_written": i, "some_metric": i})
+        print(i)
+        if i > config.some_number * n and i % 10 == 0:
+            with open(f"file_{i}.txt", "w") as f:
+                f.write(f"Hello world {i}!")
+                print(f"Checkpointing at {i}")
+
+        if i > config.some_number * n and i % 20 == 0:
+            # To test resuming.
+            raise ValueError("Crashing at 20")
+            pass
+        sleep(1)
+
+    logger.info("Finished writing files")
+
+
+def postprocess_and_save_config(config):
+    """Here you can make some computations with the config to add new keys, correct some values, etc.
+    E.g., read-only variables that can be useful when navigating the experiments on wandb
+     for filtering, sorting, etc.
+    Save the new config (as a file to record it) and pass it to wandb to record it with your experiment.
+    """
+    Path("config/").mkdir(exist_ok=True)
+    # Save the config if the file doesn't exist; otherwise (in case of resuming), assert that the config is the same.
+    utils.config.maybe_save_config(config, "config/config-before-postprocess.yaml")
+    with omegaconf.open_dict(config):
+        # Example of adding a new key to the config
+        config.some_new_key = "bar"
+    OmegaConf.resolve(config)
+    utils.config.maybe_save_config(config, "config/config-resolved.yaml")
 
 
 if __name__ == "__main__":
diff --git a/src/template_package_name/utils/__init__.py b/src/template_package_name/utils/__init__.py
index e865ca5..e4bb34f 100644
--- a/src/template_package_name/utils/__init__.py
+++ b/src/template_package_name/utils/__init__.py
@@ -1 +1 @@
-from template_package_name.utils import seeding
+from template_package_name.utils import config, seeding
diff --git a/src/template_package_name/utils/config.py b/src/template_package_name/utils/config.py
new file mode 100644
index 0000000..a88d2d1
--- /dev/null
+++ b/src/template_package_name/utils/config.py
@@ -0,0 +1,90 @@
+# Resolvers can be used in the config files.
+# https://omegaconf.readthedocs.io/en/latest/custom_resolvers.html
+# They are useful when you want to make the default values of some config variables
+# result from direct computation of other config variables.
+# Only put variables meant to be edited by the user (as opposed to read-only variables computed in the experiment script)
+# and avoid making them too complicated, the point is not to write code in the config file.
+import logging
+import subprocess
+from hashlib import blake2b
+from pathlib import Path
+
+from omegaconf import DictConfig, OmegaConf, omegaconf
+
+from template_package_name import utils
+
+# Hydra sets up the logger automatically.
+# https://hydra.cc/docs/tutorials/basic/running_your_app/logging/
+logger = logging.getLogger(__name__)
+
+
+def register_resolvers():
+    if not OmegaConf.has_resolver("eval"):
+        # Useful to evaluate expressions in the config file.
+        OmegaConf.register_new_resolver("eval", eval, use_cache=True)
+    if not OmegaConf.has_resolver("generate_random_seed"):
+        # Generate a random seed and record it in the config of the experiment.
+        OmegaConf.register_new_resolver(
+            "generate_random_seed", utils.seeding.generate_random_seed, use_cache=True
+        )
+
+
+def maybe_save_config(config, path):
+    """Save the config if the file doesn't exist; otherwise (in case of resuming), assert that the config is the same."""
+    if not Path(path).exists():
+        OmegaConf.save(config, path)
+    else:
+        new_config = config.copy()
+        remove_excluded_keys(new_config, config.resuming.exclude_keys)
+        existing_config = OmegaConf.load(path)
+        remove_excluded_keys(existing_config, config.resuming.exclude_keys)
+        try:
+            OmegaConf.resolve(new_config)
+            OmegaConf.resolve(existing_config)
+            assert new_config == existing_config
+        except AssertionError:
+            logger.error(f"Config to resume is different from the one saved in {path}")
+            raise AssertionError
+
+
+def remove_excluded_keys(config: DictConfig, exclude_keys: list[str]):
+    """Remove keys from the config that are specified in exclude_keys.
+    exclude_keys are of the form "key1.key2.key3" to remove the key3 from the key1.key2 dictionary.
+    """
+    with omegaconf.open_dict(config):
+        for key in exclude_keys:
+            keys = key.split(".")
+            val = config
+            for key_ in keys[:-1]:
+                val = val[key_]
+            del val[keys[-1]]
+
+
+def setup_resuming_dir(config):
+    """Create a unique identifier of the experiment used to specify a resuming/checkpoint directory.
+    The identifier is a hash of the config, excluding keys specified in config.resuming.exclude_keys.
+    If config.resuming.use_commit is True, the commit hash is appended to the identifier.
+    I.e. the checkpoint directory is defined by: the config - the excluded config keys + the commit hash (if specified)
+    """
+    resuming_hash = ""
+    config_to_hash = config.copy()
+
+    # resolve config
+    OmegaConf.resolve(config_to_hash)
+    remove_excluded_keys(config_to_hash, config.resuming.exclude_keys)
+    config_hash = blake2b(str(config_to_hash).encode(), digest_size=8).hexdigest()
+    resuming_hash += config_hash
+    if config.resuming.use_commit:
+        commit_hash = (
+            subprocess.check_output(["git", "rev-parse", "HEAD"])
+            .strip()
+            .decode("utf-8")
+        )
+        resuming_hash += f"-{commit_hash[:8]}"
+
+    resuming_dir = Path.cwd().parent / "checkpoints" / resuming_hash
+    resuming_dir.mkdir(parents=True, exist_ok=True)
+    with omegaconf.open_dict(config):
+        config.resuming_dir = str(resuming_dir)
+
+    return resuming_dir, resuming_hash
diff --git a/src/template_package_name/utils/seeding.py b/src/template_package_name/utils/seeding.py
index 14566c7..28f103c 100644
--- a/src/template_package_name/utils/seeding.py
+++ b/src/template_package_name/utils/seeding.py
@@ -11,18 +11,24 @@ def seed_everything(config):
     """Seed all random generators."""
     random.seed(config.seed)
 
-    # For numpy:
+    ## For numpy:
     # This is for legacy numpy:
     # np.random.seed(config.seed)
     # New code should make a Generator out of the config.seed directly:
     # https://numpy.org/doc/stable/reference/random/generated/numpy.random.seed.html
 
-    # For PyTorch:
+    ## For PyTorch:
     # torch.manual_seed(config.seed)
+
     # Higher (e.g., on CUDA too) reproducibility with deterministic algorithms:
     # https://pytorch.org/docs/stable/notes/randomness.html
-    # torch.backends.cudnn.benchmark = False
-    # torch.use_deterministic_algorithms(True)
-    # os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
+
     # Not supported for all operations though:
     # https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html
+    # torch.use_deterministic_algorithms(True)
+
+    # Otherwise, a lighter version of the above, as not all algorithms have a deterministic implementation:
+    # torch.backends.cudnn.deterministic = True
+
+    # torch.backends.cudnn.benchmark = False
+    # os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
diff --git a/template/README.md b/template/README.md
index da8bce7..44fdff8 100644
--- a/template/README.md
+++ b/template/README.md
@@ -16,6 +16,14 @@ This template ensures the reproducibility of your results through 3 artifacts:
     - Recorded and uploaded by you.
     - (Virtually) placed in the placeholder directories abstracting away the user storage system.
 
+## Checkpointing
+
+TODO
+
+### Compatibility with Weights & Biases
+
+TODO
+
 ## Template Q&A
 
 ### I started my project from an older version of the template, how do I get updates?

From 864dd3807a90a99ba96196aa34ff4baf25df4633 Mon Sep 17 00:00:00 2001
From: Skander Moalla <skander.moalla@epfl.ch>
Date: Sun, 22 Dec 2024 05:04:52 +0100
Subject: [PATCH 2/9] allow force resume

---
 src/template_package_name/configs/setup.yaml | 11 ++++++-----
 src/template_package_name/utils/config.py    |  3 +++
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/template_package_name/configs/setup.yaml b/src/template_package_name/configs/setup.yaml
index 307bb92..6cb1107 100644
--- a/src/template_package_name/configs/setup.yaml
+++ b/src/template_package_name/configs/setup.yaml
@@ -45,7 +45,7 @@ wandb:
   run_id: null
 
 run_dir: ${hydra:run.dir}
-resuming_dir: ${hydra:run.dir}
+resuming_dir: null
 
 resuming:
   resume: False
@@ -53,8 +53,9 @@ resuming:
   wandb_cache_bust: 0 # Limitation of wandb. Cannot create runs with the same ID if deleted previously.
                       # Use this to refresh the id of the run and make it a "new" run.
   exclude_keys: # Can be a deep key e.g. model.optimizer.lr
-    - data_dir
-    - outputs_dir
     - run_dir
-    - resuming_dir
-    - wandb.run_id
+    - data_dir      # To be able to resume by another user.
+    - outputs_dir   # To be able to resume by another user.
+    - resuming_dir  # To be able to force resume from anywhere.
+    - wandb         # To be able to move a run and resume it.
+    - resuming.exclude_keys # To be able to add keys on the fly and force resume.
diff --git a/src/template_package_name/utils/config.py b/src/template_package_name/utils/config.py
index a88d2d1..8b40c03 100644
--- a/src/template_package_name/utils/config.py
+++ b/src/template_package_name/utils/config.py
@@ -66,6 +66,9 @@ def setup_resuming_dir(config):
     If config.resuming.use_commit is True, the commit hash is appended to the identifier.
     I.e. the checkpoint directory is defined by: the config - the excluded config keys + the commit hash (if specified)
     """
+    if config.resuming_dir is not None:
+        return Path(config.resuming_dir), Path(config.resuming_dir).name
+
     resuming_hash = ""
     config_to_hash = config.copy()
 

From 69875010f0586bc9d17a948282c761fe72d34d93 Mon Sep 17 00:00:00 2001
From: Skander Moalla <skander.moalla@epfl.ch>
Date: Sun, 22 Dec 2024 05:18:53 +0100
Subject: [PATCH 3/9] Add docs

---
 template/README.md | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/template/README.md b/template/README.md
index 44fdff8..9675e7a 100644
--- a/template/README.md
+++ b/template/README.md
@@ -18,11 +18,32 @@ This template ensures the reproducibility of your results through 3 artifacts:
 
 ## Checkpointing
 
-TODO
+The template provides an automatic setup of the checkpointing directory for an experiment.
+The unique identifier for the directory will be created by hashing the config file and optionally the git commit sha.
+Running the same experiment with the same config will thus set its working directory to the same checkpoint directory
+every time.
+
+To use this feature pass `resuming.resume=True` and `resuming.use_commit=True` to your script using a Hydra config
+that inherits form the `setup.yaml` config file like the `template_experiment.py` script.
+
+Even without using `resuming.use_commit=True`, the path to the checkpoint directory will be computed, and you could
+read from it, for example.
+
+You can also force a resuming directory by passing `resuming.resume_dir=<path>` to your script.
 
 ### Compatibility with Weights & Biases
 
-TODO
+For a non-sweep run, the run will have the id of the checkpoint directory as its wandb id, therefore your wandb run
+will stay the same and resume when your run is resumed.
+Make sure to use a custom step key when you log metrics so that you can have full control over when to start rewriting
+when you resume (E.g. if you checkpoint less often than you log, you may relog from the last checkpoint), otherwise
+the default step key of wandb will resume from the latest step and may be inconsistent with the checkpoint.
+
+For a sweep run, it already has an id from the sweep, so to resume it you should manually get its id and restart
+the script with the same arguments the sweep agent started it, this way the config and the
+checkpoint directory will be the same.
+This is a limitation of the wandb sweep system.
+See [this issue.](https://github.com/wandb/wandb/issues/9143)
 
 ## Template Q&A
 

From 26f85f061cc9cf2f64c26f7beba87171019b1ba2 Mon Sep 17 00:00:00 2001
From: Skander Moalla <skander.moalla@epfl.ch>
Date: Sun, 22 Dec 2024 17:08:03 +0100
Subject: [PATCH 4/9] docs

---
 src/template_package_name/template_experiment.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/template_package_name/template_experiment.py b/src/template_package_name/template_experiment.py
index 9984a3e..f561320 100644
--- a/src/template_package_name/template_experiment.py
+++ b/src/template_package_name/template_experiment.py
@@ -1,5 +1,6 @@
-# An example file to run an experiment.
-# Keep this, it's used as an example to run the code after a user installs the project.
+"""An example file to run an experiment.
+Keep this, it's used as an example to run the code after a user installs the project.
+"""
 
 import logging
 import os

From 79a09f6a168bb3bc1b60642577ce9d8536c56dbc Mon Sep 17 00:00:00 2001
From: Skander Moalla <skander.moalla@epfl.ch>
Date: Sun, 22 Dec 2024 17:26:39 +0100
Subject: [PATCH 5/9] change logic

---
 .../template_experiment.py                    | 24 ++++++++++++-------
 template/README.md                            | 13 +++++-----
 2 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/src/template_package_name/template_experiment.py b/src/template_package_name/template_experiment.py
index f561320..3bb88fb 100644
--- a/src/template_package_name/template_experiment.py
+++ b/src/template_package_name/template_experiment.py
@@ -77,7 +77,7 @@ def main(config: DictConfig) -> None:
 
     # Example experiment
     n = 100
-    # Loop from config.some_number*n to config.some_number*n + n and write 100 files to the disk.
+    # Loop from 0 to 99 and write 100 files to the disk.
 
     # Attempt to resume
     # Find the latest checkpoint of format file_{i}.txt
@@ -87,21 +87,27 @@ def main(config: DictConfig) -> None:
     if files:
         last_file = files[-1]
         logger.info(f"Resuming from {last_file}")
-        j = int(last_file.stem.split("_")[1])
+        j = int(last_file.stem.split("_")[1]) % (config.some_number * n)
     else:
-        j = config.some_number * n - 1
-
-    for i in range(j + 1, config.some_number * n + 100 + 1):
-        wandb.log({"iteration": i, "file_written": i, "some_metric": i})
+        j = -1
+
+    for i in range(j + 1, 100):
+        wandb.log(
+            {
+                "iteration": i,
+                "file_written": i,
+                "some_metric": i + config.some_number * n,
+            }
+        )
         print(i)
-        if i > config.some_number * n and i % 10 == 0:
+        if i % 9 == 0:
             with open(f"file_{i}.txt", "w") as f:
                 f.write(f"Hello world {i}!")
                 print(f"Checkpointing at {i}")
 
-        if i > config.some_number * n and i % 20 == 0:
+        if i % 15 == 0:
             # To test resuming.
-            raise ValueError("Crashing at 20")
+            raise ValueError("Crashing at i % 15 = 0")
             pass
         sleep(1)
 
diff --git a/template/README.md b/template/README.md
index 9675e7a..cfae273 100644
--- a/template/README.md
+++ b/template/README.md
@@ -11,7 +11,7 @@ This template ensures the reproducibility of your results through 3 artifacts:
 2. The project code.
     - Recorded in the git repository that you keep up to date.
     - Made reproducible (to a desired degree) by you correctly seeding the random number generators and
-      optionally removing non-deterministic operations or replicable by running enough seeds.
+      optionally removing non-deterministic operations, or replicable by running enough seeds.
 3. The data, outputs, model weights and other artifacts.
     - Recorded and uploaded by you.
     - (Virtually) placed in the placeholder directories abstracting away the user storage system.
@@ -19,15 +19,15 @@ This template ensures the reproducibility of your results through 3 artifacts:
 ## Checkpointing
 
 The template provides an automatic setup of the checkpointing directory for an experiment.
-The unique identifier for the directory will be created by hashing the config file and optionally the git commit sha.
+The unique identifier for the directory is created by hashing the config used and optionally the git commit sha.
 Running the same experiment with the same config will thus set its working directory to the same checkpoint directory
-every time.
+every time (if the resuming option is enabled).
 
 To use this feature pass `resuming.resume=True` and `resuming.use_commit=True` to your script using a Hydra config
-that inherits form the `setup.yaml` config file like the `template_experiment.py` script.
+that inherits from the `setup.yaml` config file, like the `template_experiment.py` script.
 
 Even without using `resuming.use_commit=True`, the path to the checkpoint directory will be computed, and you could
-read from it, for example.
+for example, read from it.
 
-You can also force a resuming directory by passing `resuming.resume_dir=<path>` to your script.
+You can also force a resuming directory by passing `resuming_dir=<path>` to your script.
 
@@ -41,7 +41,8 @@ the default step key of wandb will resume from the latest step and may be incons
 
 For a sweep run, it already has an id from the sweep, so to resume it you should manually get its id and restart
 the script with the same arguments the sweep agent started it, this way the config and the
-checkpoint directory will be the same.
+checkpoint directory will be the same
+(i.e. go to the wandb run UI, copy-paste the command it was run with and add `wandb.run_id=<id-of-the-run>`).
 This is a limitation of the wandb sweep system.
 See [this issue.](https://github.com/wandb/wandb/issues/9143)
 

From 66dbf264d0628e5696801d1dd59bedad46d6f9bf Mon Sep 17 00:00:00 2001
From: Skander Moalla <skander.moalla@epfl.ch>
Date: Sun, 22 Dec 2024 17:27:38 +0100
Subject: [PATCH 6/9] change logic

---
 src/template_package_name/template_experiment.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/template_package_name/template_experiment.py b/src/template_package_name/template_experiment.py
index 3bb88fb..c0f7837 100644
--- a/src/template_package_name/template_experiment.py
+++ b/src/template_package_name/template_experiment.py
@@ -102,7 +102,7 @@ def main(config: DictConfig) -> None:
         print(i)
         if i % 9 == 0:
             with open(f"file_{i}.txt", "w") as f:
-                f.write(f"Hello world {i}!")
+                f.write(f"some_metric={i + config.some_number * n}")
                 print(f"Checkpointing at {i}")
 
         if i % 15 == 0:

From fff152a417b0bdb247098bc1539948ca4a53866b Mon Sep 17 00:00:00 2001
From: Skander Moalla <skander.moalla@epfl.ch>
Date: Sun, 22 Dec 2024 17:37:45 +0100
Subject: [PATCH 7/9] change logic

---
 src/template_package_name/template_experiment.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/template_package_name/template_experiment.py b/src/template_package_name/template_experiment.py
index c0f7837..c01a68e 100644
--- a/src/template_package_name/template_experiment.py
+++ b/src/template_package_name/template_experiment.py
@@ -77,7 +77,7 @@ def main(config: DictConfig) -> None:
 
     # Example experiment
     n = 100
-    # Loop from 0 to 99 and write 100 files to the disk.
+    # Loop from 0 to 27 and write 28 files to the disk.
 
     # Attempt to resume
     # Find the latest checkpoint of format file_{i}.txt
@@ -91,7 +91,7 @@ def main(config: DictConfig) -> None:
     else:
         j = -1
 
-    for i in range(j + 1, 100):
+    for i in range(j + 1, 28):
         wandb.log(
             {
                 "iteration": i,

From a3b3513cd306f3ae1d3371a162e4db201d6eba68 Mon Sep 17 00:00:00 2001
From: Skander Moalla <skander.moalla@epfl.ch>
Date: Sun, 22 Dec 2024 17:39:27 +0100
Subject: [PATCH 8/9] change logic

---
 src/template_package_name/template_experiment.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/template_package_name/template_experiment.py b/src/template_package_name/template_experiment.py
index c01a68e..6da3b18 100644
--- a/src/template_package_name/template_experiment.py
+++ b/src/template_package_name/template_experiment.py
@@ -77,7 +77,7 @@ def main(config: DictConfig) -> None:
 
     # Example experiment
     n = 100
-    # Loop from 0 to 27 and write 28 files to the disk.
+    # Loop from 1 to 27 and write a checkpoint file to the disk every 9 iterations.
 
     # Attempt to resume
     # Find the latest checkpoint of format file_{i}.txt
@@ -89,7 +89,7 @@ def main(config: DictConfig) -> None:
         logger.info(f"Resuming from {last_file}")
         j = int(last_file.stem.split("_")[1]) % (config.some_number * n)
     else:
-        j = -1
+        j = 0
 
     for i in range(j + 1, 28):
         wandb.log(

From 616d0d302ee420b9d1089bdd4fd46d7588c40822 Mon Sep 17 00:00:00 2001
From: Skander Moalla <skander.moalla@epfl.ch>
Date: Sun, 22 Dec 2024 17:41:07 +0100
Subject: [PATCH 9/9] change logic

---
 src/template_package_name/template_experiment.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/template_package_name/template_experiment.py b/src/template_package_name/template_experiment.py
index 6da3b18..b17357e 100644
--- a/src/template_package_name/template_experiment.py
+++ b/src/template_package_name/template_experiment.py
@@ -105,8 +105,8 @@ def main(config: DictConfig) -> None:
                 f.write(f"some_metric={i + config.some_number * n}")
                 print(f"Checkpointing at {i}")
 
-        if i % 15 == 0:
-            # To test resuming.
+        if j == 0 and i % 15 == 0:
+            # Crash at first run to test resuming.
             raise ValueError("Crashing at i % 15 = 0")
             pass
         sleep(1)