
Commit eccc192

[train] New persistence mode: Sanity-check release test (ray-project#39354)
Signed-off-by: Justin Yu <justinvyu@anyscale.com>
Co-authored-by: matthewdeng <matt@anyscale.com>
1 parent: dc3d163

9 files changed: +525 −58 lines

python/ray/train/tests/test_new_persistence.py

+87 −27
@@ -1,4 +1,5 @@
 from contextlib import contextmanager
+import logging
 import os
 from pathlib import Path
 import pickle
@@ -13,26 +14,53 @@

 import ray
 from ray import train, tune
+from ray._private.test_utils import simulate_storage
 from ray.air._internal.uri_utils import URI
 from ray.air.constants import EXPR_RESULT_FILE
-from ray.train._internal.storage import _download_from_fs_path, StorageContext
+from ray.train._internal.storage import (
+    _delete_fs_path,
+    _download_from_fs_path,
+    StorageContext,
+)
 from ray.train._checkpoint import Checkpoint
 from ray.train.base_trainer import TrainingFailedError
 from ray.train.constants import RAY_AIR_NEW_PERSISTENCE_MODE
 from ray.train.data_parallel_trainer import DataParallelTrainer
 from ray.tune.trainable.trainable import _DICT_CHECKPOINT_FILE_NAME

-from ray.train.tests.util import mock_s3_bucket_uri

+class TestConstants:
+    NUM_ITERATIONS = 6  # == num_checkpoints == num_artifacts
+    NUM_TRIALS = 2
+    NUM_WORKERS = 3

-_SCORE_KEY = "score"
-NUM_ITERATIONS = 6  # == num_checkpoints == num_artifacts
-NUM_TRIALS = 2
-NUM_WORKERS = 3
+    SCORE_KEY = "score"


 @contextmanager
-def dummy_context_manager():
+def mock_s3_bucket_uri():
+    port = 5002
+    region = "us-west-2"
+    with simulate_storage("s3", port=port, region=region) as s3_uri:
+        import boto3
+
+        s3 = boto3.client(
+            "s3", region_name=region, endpoint_url=f"http://localhost:{port}"
+        )
+        # Bucket name will be autogenerated/unique per test
+        bucket_name = URI(s3_uri).name
+        s3.create_bucket(
+            Bucket=bucket_name,
+            CreateBucketConfiguration={"LocationConstraint": region},
+        )
+        # Disable server HTTP request logging
+        logging.getLogger("werkzeug").setLevel(logging.WARNING)
+        yield URI(s3_uri)
+        logging.getLogger("werkzeug").setLevel(logging.INFO)
+
+
+@contextmanager
+def dummy_context_manager(*args, **kwargs):
     yield "dummy value"

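The relocated mock_s3_bucket_uri helper starts a locally simulated S3 endpoint via simulate_storage, creates a uniquely named bucket, and yields its URI. A minimal sketch (not part of this commit) of how a test could point a run at the mocked bucket; the experiment name, the reuse of train_fn, and the assertion are illustrative:

# Sketch only: drives a small run against the mocked bucket.
from ray import train, tune
from ray.train.tests.test_new_persistence import (
    TestConstants,
    mock_s3_bucket_uri,
    train_fn,
)


def test_storage_path_on_mock_s3():
    with mock_s3_bucket_uri() as storage_uri:
        tuner = tune.Tuner(
            train_fn,
            param_space={"num_iterations": TestConstants.NUM_ITERATIONS},
            run_config=train.RunConfig(
                name="mock_s3_sanity_check",  # hypothetical experiment name
                storage_path=str(storage_uri),  # s3://<generated bucket>
            ),
        )
        results = tuner.fit()
        assert not results.errors
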
@@ -164,16 +192,20 @@ def train_fn(config):

     checkpoint = train.get_checkpoint()
     if checkpoint:
-        with checkpoint.as_directory() as checkpoint_dir:
-            with open(os.path.join(checkpoint_dir, "checkpoint.pkl"), "rb") as f:
-                state = pickle.load(f)
+        custom_restore_fn = config.get("custom_restore_fn")
+        if custom_restore_fn:
+            state = custom_restore_fn(checkpoint)
+        else:
+            with checkpoint.as_directory() as checkpoint_dir:
+                with open(os.path.join(checkpoint_dir, "checkpoint.pkl"), "rb") as f:
+                    state = pickle.load(f)
         print("Loaded back state from checkpoint:", state)
         start = state["iter"] + 1

     for i in range(start, config.get("num_iterations", 5)):
-        time.sleep(0.25)
+        time.sleep(config.get("time_per_iter", 0.25))

-        metrics = {"iter": i, _SCORE_KEY: i}
+        metrics = {"iter": i, TestConstants.SCORE_KEY: i}

         # Save an artifact in the local trial dir.
         rank = train.get_context().get_world_rank()
@@ -199,7 +231,10 @@ def train_fn(config):
             with open(os.path.join(temp_dir, checkpoint_file_name), "wb") as f:
                 pickle.dump({"iter": i}, f)

-            train.report(metrics, checkpoint=Checkpoint.from_directory(temp_dir))
+            with config.get("custom_save_fn", dummy_context_manager)(temp_dir):
+                train.report(
+                    metrics, checkpoint=Checkpoint.from_directory(temp_dir)
+                )
             # `train.report` should not have deleted this!
             assert os.path.exists(temp_dir)

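The two new train_loop_config hooks are read straight from config: custom_restore_fn receives the Checkpoint and returns the saved state, while custom_save_fn is a context-manager factory entered around train.report with the local checkpoint directory. A hypothetical pair, shown only to illustrate the call signatures train_fn expects:

# Illustrative helpers; they are not defined anywhere in this diff.
import os
import pickle
from contextlib import contextmanager

from ray.train import Checkpoint


def my_restore_fn(checkpoint: Checkpoint) -> dict:
    # Mirrors the default path: unpack the pickled state written by train_fn.
    with checkpoint.as_directory() as checkpoint_dir:
        with open(os.path.join(checkpoint_dir, "checkpoint.pkl"), "rb") as f:
            return pickle.load(f)


@contextmanager
def my_save_fn(temp_dir: str):
    # Runs around train.report, e.g. to stage extra files into the
    # checkpoint directory before it is uploaded.
    with open(os.path.join(temp_dir, "extra_file.txt"), "w") as f:
        f.write("written by custom_save_fn")
    yield


train_loop_config = {"custom_restore_fn": my_restore_fn, "custom_save_fn": my_save_fn}
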
@@ -260,7 +295,12 @@ def load_checkpoint(self, checkpoint_dict_or_path):
         ).read_text() == "dummy"


-def _resume_from_checkpoint(checkpoint: Checkpoint, expected_state: dict):
+def _resume_from_checkpoint(
+    checkpoint: Checkpoint,
+    expected_state: dict,
+    storage_path: Optional[str] = None,
+    storage_filesystem: Optional[pyarrow.fs.FileSystem] = None,
+):
     print(f"\nStarting run with `resume_from_checkpoint`: {checkpoint}\n")

     def assert_fn(config):
@@ -281,7 +321,11 @@ def assert_fn(config):
     trainer = DataParallelTrainer(
         assert_fn,
         scaling_config=train.ScalingConfig(num_workers=2),
-        run_config=train.RunConfig(name="test_resume_from_checkpoint"),
+        run_config=train.RunConfig(
+            name="test_resume_from_checkpoint",
+            storage_path=storage_path,
+            storage_filesystem=storage_filesystem,
+        ),
         resume_from_checkpoint=checkpoint,
     )
     result = trainer.fit()
@@ -291,6 +335,9 @@ def assert_fn(config):
         result.checkpoint.path
     ).name == StorageContext._make_checkpoint_dir_name(0)

+    # Clean up this run's experiment directory immediately after.
+    _delete_fs_path(result.filesystem, Path(result.path).parent.as_posix())
+

 def _assert_storage_contents(
     local_inspect_dir: Path,
@@ -299,7 +346,10 @@
     trainable_name: str,
     test_trainer: bool,
     no_checkpoint_ranks: List[int] = None,
+    constants: type = TestConstants,
 ):
+    no_checkpoint_ranks = no_checkpoint_ranks or []
+
     # Second, inspect the contents of the storage path
     storage_path_ls = list(local_inspect_dir.glob("*"))
     assert len(storage_path_ls) == 1  # Only expect 1 experiment dir
@@ -319,11 +369,13 @@
     assert (
         len(list(exp_dir.glob(f"{trainable_name}*"))) == 1
         if test_trainer
-        else NUM_TRIALS
+        else constants.NUM_TRIALS
     )
     for trial_dir in exp_dir.glob(f"{trainable_name}*"):
         # If set, expect num_to_keep. Otherwise, expect to see all of them.
-        expected_num_checkpoints = checkpoint_config.num_to_keep or NUM_ITERATIONS
+        expected_num_checkpoints = (
+            checkpoint_config.num_to_keep or constants.NUM_ITERATIONS
+        )

         assert len(list(trial_dir.glob("checkpoint_*"))) == expected_num_checkpoints
         checkpoint_idxs = sorted(
@@ -335,7 +387,10 @@
         # Ex: If num_to_keep=2 out of 6 total checkpoints,
         # expect checkpoint_004 and checkpoint_005.
         assert checkpoint_idxs == list(
-            range(NUM_ITERATIONS - expected_num_checkpoints, NUM_ITERATIONS)
+            range(
+                constants.NUM_ITERATIONS - expected_num_checkpoints,
+                constants.NUM_ITERATIONS,
+            )
         )

         for checkpoint_dir in trial_dir.glob("checkpoint_*"):
@@ -353,12 +408,16 @@
                 for checkpoint_shard in checkpoint_dir.glob(
                     "checkpoint_shard-*.pkl"
                 )
-            } == {i for i in range(NUM_WORKERS) if i not in no_checkpoint_ranks}
+            } == {
+                i
+                for i in range(constants.NUM_WORKERS)
+                if i not in no_checkpoint_ranks
+            }

         if test_trainer:
-            expected_num_artifacts = NUM_ITERATIONS * NUM_WORKERS
+            expected_num_artifacts = constants.NUM_ITERATIONS * constants.NUM_WORKERS
         else:
-            expected_num_artifacts = NUM_ITERATIONS
+            expected_num_artifacts = constants.NUM_ITERATIONS
         assert len(list(trial_dir.glob("artifact-*"))) == expected_num_artifacts

         # NOTE: This result file is synced by the driver.
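Threading constants: type = TestConstants through _assert_storage_contents lets callers outside this file reuse the same storage assertions with different sizing, which is presumably how the multi-node release test configured later in this commit (its test file is not shown in this view) drives them. A minimal hypothetical override:

# Hypothetical subclass; the value is illustrative and not taken from the diff.
from ray.train.tests.test_new_persistence import TestConstants


class ReleaseTestConstants(TestConstants):
    NUM_WORKERS = 4  # e.g. one worker per node in a 4-node cluster


# A caller would then pass constants=ReleaseTestConstants (plus that run's own
# no_checkpoint_ranks) into _assert_storage_contents.
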
@@ -419,7 +478,7 @@ def test_tuner(
     tuner = tune.Tuner(
         trainable,
         param_space={
-            "num_iterations": NUM_ITERATIONS,
+            "num_iterations": TestConstants.NUM_ITERATIONS,
             "fail_iters": [2, 4],
             # NOTE: This param is only used in the ClassTrainable.
             "save_checkpoint_as_dict": tune.grid_search([True, False]),
@@ -464,7 +523,7 @@ def test_tuner(
     experiment_fs_path = result_grid.experiment_path
     assert isinstance(result_grid.filesystem, pyarrow.fs.FileSystem), result_grid
     assert experiment_fs_path == os.path.join(storage_fs_path, exp_name)
-    assert len(result_grid) == NUM_TRIALS
+    assert len(result_grid) == TestConstants.NUM_TRIALS
     for result in result_grid:
         trial_fs_path = result.path
         assert isinstance(result.filesystem, pyarrow.fs.FileSystem), result
@@ -489,7 +548,7 @@ def test_tuner(
         train.CheckpointConfig(),
         train.CheckpointConfig(
             num_to_keep=1,
-            checkpoint_score_attribute=_SCORE_KEY,
+            checkpoint_score_attribute=TestConstants.SCORE_KEY,
             checkpoint_score_order="max",
         ),
     ],
@@ -538,14 +597,14 @@ def test_trainer(
         train_fn,
         train_loop_config={
             "in_trainer": True,
-            "num_iterations": NUM_ITERATIONS,
+            "num_iterations": TestConstants.NUM_ITERATIONS,
             "fail_iters": [2, 4],
             # TODO(justinvyu): This should be separated into its own test once
             # CI has been fully migrated.
             # Test that global rank 0 is not required to checkpoint.
             "no_checkpoint_ranks": no_checkpoint_ranks,
         },
-        scaling_config=train.ScalingConfig(num_workers=NUM_WORKERS),
+        scaling_config=train.ScalingConfig(num_workers=TestConstants.NUM_WORKERS),
         run_config=train.RunConfig(
             storage_path=storage_path,
             storage_filesystem=storage_filesystem,
@@ -574,7 +633,8 @@ def test_trainer(
         "RAY_AIR_LOCAL_CACHE_DIR", str(tmp_path / "resume_from_checkpoint")
     )
     _resume_from_checkpoint(
-        result.checkpoint, expected_state={"iter": NUM_ITERATIONS - 1}
+        result.checkpoint,
+        expected_state={"iter": TestConstants.NUM_ITERATIONS - 1},
     )

     local_inspect_dir, storage_fs_path = _get_local_inspect_dir(

python/ray/train/tests/util.py

-26
@@ -1,15 +1,11 @@
 import contextlib
-import logging
 import os
 import tempfile
-from contextlib import contextmanager
 from typing import Any, Dict, Type

 import ray.cloudpickle as ray_pickle
 from ray.train import Checkpoint
 from ray.train._internal.storage import StorageContext
-from ray._private.test_utils import simulate_storage
-from ray.air._internal.uri_utils import URI


 @contextlib.contextmanager
@@ -41,25 +37,3 @@ def mock_storage_context() -> StorageContext:
     storage.storage_local_path = storage_path
     os.makedirs(os.path.join(storage_path, exp_name, trial_name), exist_ok=True)
     return storage
-
-
-@contextmanager
-def mock_s3_bucket_uri():
-    port = 5002
-    region = "us-west-2"
-    with simulate_storage("s3", port=port, region=region) as s3_uri:
-        import boto3
-
-        s3 = boto3.client(
-            "s3", region_name=region, endpoint_url=f"http://localhost:{port}"
-        )
-        # Bucket name will be autogenerated/unique per test
-        bucket_name = URI(s3_uri).name
-        s3.create_bucket(
-            Bucket=bucket_name,
-            CreateBucketConfiguration={"LocationConstraint": region},
-        )
-        # Disable server HTTP request logging
-        logging.getLogger("werkzeug").setLevel(logging.WARNING)
-        yield URI(s3_uri)
-        logging.getLogger("werkzeug").setLevel(logging.INFO)

python/ray/tune/tests/test_experiment_analysis.py

+2 −5
@@ -17,11 +17,8 @@
 from ray.tune.experiment import Trial
 from ray.tune.utils import flatten_dict

-from ray.train.tests.util import (
-    create_dict_checkpoint,
-    load_dict_checkpoint,
-    mock_s3_bucket_uri,
-)
+from ray.train.tests.util import create_dict_checkpoint, load_dict_checkpoint
+from ray.train.tests.test_new_persistence import mock_s3_bucket_uri


 NUM_TRIALS = 3
+7
@@ -0,0 +1,7 @@
+#!/bin/bash
+# This script is used to build an extra layer on top of the base anyscale/ray image
+# to run the train_multinode_persistence test.
+
+set -exo pipefail
+
+pip3 install -U torch fsspec s3fs gcsfs pyarrow>=9.0.0 pytest

release/release_tests.yaml

+31
@@ -3383,6 +3383,37 @@

   alert: default

+
+- name: train_multinode_persistence
+  group: Train tests
+  working_dir: train_tests/e2e
+
+  frequency: nightly
+  team: ml
+
+  cluster:
+    byod:
+      post_build_script: byod_train_persistence_test.sh
+    cluster_compute: compute_aws.yaml
+
+  run:
+    timeout: 3000
+    script: pytest -v test_persistence.py -s
+
+    wait_for_nodes:
+      num_nodes: 4
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: compute_gce.yaml
+
+  alert: default
+
+
 ########################
 # Alpa tests
 ########################
+22
@@ -0,0 +1,22 @@
+cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
+region: us-west-2
+
+max_workers: 3
+
+head_node_type:
+  name: head_node
+  instance_type: m5.2xlarge
+
+worker_node_types:
+  - name: worker_node
+    instance_type: m5.2xlarge
+    max_workers: 3
+    min_workers: 3
+    use_spot: false
+
+aws:
+  TagSpecifications:
+    - ResourceType: "instance"
+      Tags:
+        - Key: ttl-hours
+          Value: '24'
+17
@@ -0,0 +1,17 @@
+cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
+region: us-west1
+allowed_azs:
+  - us-west1-b
+
+max_workers: 3
+
+head_node_type:
+  name: head_node
+  instance_type: n2-standard-8
+
+worker_node_types:
+  - name: worker_node
+    instance_type: n2-standard-8
+    max_workers: 3
+    min_workers: 3
+    use_spot: false
