Skip to content

Commit da192cb

Browse files
justinvyu authored and arvind-chandra committed
[train] New persistence mode: Tune tests + examples (small) (batch 1) (ray-project#38807)
Signed-off-by: Justin Yu <justinvyu@anyscale.com>
Signed-off-by: e428265 <arvind.chandramouli@lmco.com>
1 parent 86abe7b commit da192cb

15 files changed

+238 -348 lines changed

python/ray/train/tests/test_training_iterator.py

+3 -8
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
import functools
22
import time
3-
import tempfile
43
from unittest.mock import patch
54
import pytest
65
from ray.train._internal.worker_group import WorkerGroup
@@ -12,7 +11,6 @@
1211
from ray.air._internal.util import StartTraceback
1312
from ray.train.backend import BackendConfig
1413
from ray.train._internal.session import init_session, get_session
15-
from ray.train._internal.storage import StorageContext
1614
from ray.train._internal.backend_executor import BackendExecutor
1715
from ray.train._internal.utils import construct_train_func
1816
from ray.train._internal.checkpoint import CheckpointManager
@@ -23,12 +21,13 @@
2321
train_func as linear_train_func,
2422
)
2523

24+
from ray.train.tests.util import mock_storage_context
25+
2626
MAX_RETRIES = 3
2727

2828

2929
@pytest.fixture(autouse=True, scope="module")
3030
def patch_tune_session():
31-
tempdir = tempfile.mkdtemp()
3231
if not get_session():
3332
init_session(
3433
training_func=None,
@@ -37,11 +36,7 @@ def patch_tune_session():
3736
node_rank=None,
3837
local_world_size=None,
3938
world_size=None,
40-
storage=StorageContext(
41-
storage_path=tempdir,
42-
experiment_dir_name="exp_name",
43-
trial_dir_name="trial_name",
44-
),
39+
storage=mock_storage_context(),
4540
)
4641
yield
4742

python/ray/train/tests/util.py

+10 -4
Original file line number | Diff line number | Diff line change
@@ -26,8 +26,14 @@ def load_dict_checkpoint(checkpoint: Checkpoint) -> Dict[str, Any]:
2626

2727

2828
def mock_storage_context() -> StorageContext:
29-
return StorageContext(
30-
storage_path=tempfile.mkdtemp(),
31-
experiment_dir_name="exp_name",
32-
trial_dir_name="trial_name",
29+
storage_path = tempfile.mkdtemp()
30+
exp_name = "exp_name"
31+
trial_name = "trial_name"
32+
storage = StorageContext(
33+
storage_path=storage_path,
34+
experiment_dir_name=exp_name,
35+
trial_dir_name=trial_name,
3336
)
37+
storage.storage_local_path = storage_path
38+
os.makedirs(os.path.join(storage_path, exp_name, trial_name), exist_ok=True)
39+
return storage

python/ray/tune/BUILD

+9 -18
Original file line number | Diff line number | Diff line change
@@ -50,7 +50,7 @@ py_test(
5050
name = "test_actor_reuse",
5151
size = "large",
5252
srcs = ["tests/test_actor_reuse.py"],
53-
tags = ["team:ml", "exclusive", "no_new_storage"],
53+
tags = ["team:ml", "exclusive", "new_storage"],
5454
deps = [":tune_lib", ":conftest"],
5555
)
5656

@@ -94,15 +94,6 @@ py_test(
9494
tags = ["team:ml", "exclusive", "rllib"],
9595
)
9696

97-
py_test(
98-
name = "test_cluster_searcher",
99-
size = "large",
100-
srcs = ["tests/test_cluster_searcher.py"],
101-
data = ["tests/_test_cluster_interrupt_searcher.py"],
102-
deps = [":tune_lib"],
103-
tags = ["team:ml", "exclusive", "no_new_storage"],
104-
)
105-
10697
py_test(
10798
name = "test_commands",
10899
size = "medium",
@@ -406,8 +397,8 @@ py_test(
406397
name = "test_tuner_restore",
407398
size = "large",
408399
srcs = ["tests/test_tuner_restore.py"],
409-
deps = [":tune_lib"],
410-
tags = ["team:ml", "exclusive", "no_new_storage"],
400+
deps = [":tune_lib", ":conftest"],
401+
tags = ["team:ml", "exclusive", "new_storage"],
411402
)
412403

413404
py_test(
@@ -453,15 +444,15 @@ py_test(
453444
size = "small",
454445
srcs = ["tests/execution/test_actor_caching.py"],
455446
deps = [":tune_lib"],
456-
tags = ["team:ml", "exclusive", "no_new_storage"]
447+
tags = ["team:ml", "exclusive", "new_storage"]
457448
)
458449

459450
py_test(
460451
name = "test_controller_callback_integration",
461452
size = "large",
462453
srcs = ["tests/execution/test_controller_callback_integration.py"],
463454
deps = [":tune_lib"],
464-
tags = ["team:ml", "exclusive", "no_new_storage"]
455+
tags = ["team:ml", "exclusive", "new_storage"]
465456
)
466457

467458
py_test(
@@ -477,31 +468,31 @@ py_test(
477468
size = "large",
478469
srcs = ["tests/execution/test_controller_control_integration.py"],
479470
deps = [":tune_lib"],
480-
tags = ["team:ml", "exclusive", "no_new_storage"]
471+
tags = ["team:ml", "exclusive", "new_storage"]
481472
)
482473

483474
py_test(
484475
name = "test_controller_errors_integration",
485476
size = "large",
486477
srcs = ["tests/execution/test_controller_errors_integration.py"],
487478
deps = [":tune_lib"],
488-
tags = ["team:ml", "exclusive", "no_new_storage"]
479+
tags = ["team:ml", "exclusive", "new_storage"]
489480
)
490481

491482
py_test(
492483
name = "test_controller_resources_integration",
493484
size = "large",
494485
srcs = ["tests/execution/test_controller_resources_integration.py"],
495486
deps = [":tune_lib"],
496-
tags = ["team:ml", "exclusive", "no_new_storage"]
487+
tags = ["team:ml", "exclusive", "new_storage"]
497488
)
498489

499490
py_test(
500491
name = "test_controller_search_alg_integration",
501492
size = "large",
502493
srcs = ["tests/execution/test_controller_search_alg_integration.py"],
503494
deps = [":tune_lib"],
504-
tags = ["team:ml", "exclusive", "no_new_storage"]
495+
tags = ["team:ml", "exclusive", "new_storage"]
505496
)
506497

507498
# --------------------------------------------------------------------

python/ray/tune/tests/_test_cluster_interrupt_searcher.py

-57
This file was deleted.

python/ray/tune/tests/execution/test_actor_caching.py

+1
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
import sys
33

44
from ray.tune import PlacementGroupFactory
5+
56
from ray.tune.tests.execution.utils import create_execution_test_objects, TestingTrial
67

78

python/ray/tune/tests/execution/test_controller_callback_integration.py

+6 -9
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,8 @@
99
from ray.tune.execution.tune_controller import TuneController
1010
from ray.tune.experiment import Trial
1111

12+
from ray.train.tests.util import mock_storage_context
13+
1214

1315
@pytest.fixture(scope="function")
1416
def ray_start_4_cpus_2_gpus_extra():
@@ -43,21 +45,16 @@ def test_callback_save_restore(
4345
4446
Legacy test: test_trial_runner_3.py::TrialRunnerTest::testCallbackSaveRestore
4547
"""
46-
runner = TuneController(
47-
callbacks=[StatefulCallback()],
48-
experiment_path=str(tmpdir),
49-
)
50-
runner.add_trial(Trial("__fake", stub=True))
48+
storage = mock_storage_context()
49+
runner = TuneController(callbacks=[StatefulCallback()], storage=storage)
50+
runner.add_trial(Trial("__fake", stub=True, storage=storage))
5151
for i in range(3):
5252
runner._callbacks.on_trial_result(
5353
iteration=i, trials=None, trial=None, result=None
5454
)
5555
runner.checkpoint(force=True)
5656
callback = StatefulCallback()
57-
runner2 = TuneController(
58-
callbacks=[callback],
59-
experiment_path=str(tmpdir),
60-
)
57+
runner2 = TuneController(callbacks=[callback], storage=storage)
6158
assert callback.counter == 0
6259
runner2.resume()
6360
assert callback.counter == 3

python/ray/tune/tests/execution/test_controller_control_integration.py

+12 -2
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,12 @@
1010
from ray.tune.experiment import Trial
1111

1212

13+
from ray.train.tests.util import mock_storage_context
14+
15+
16+
STORAGE = mock_storage_context()
17+
18+
1319
@pytest.fixture(scope="function")
1420
def ray_start_4_cpus_2_gpus_extra():
1521
address_info = ray.init(num_cpus=4, num_gpus=2, resources={"a": 2})
@@ -26,12 +32,13 @@ def test_stop_trial(ray_start_4_cpus_2_gpus_extra, resource_manager_cls):
2632
Legacy test: test_trial_runner_3.py::TrialRunnerTest::testStopTrial
2733
"""
2834
runner = TuneController(
29-
resource_manager_factory=lambda: resource_manager_cls(),
35+
resource_manager_factory=lambda: resource_manager_cls(), storage=STORAGE
3036
)
3137
kwargs = {
3238
"stopping_criterion": {"training_iteration": 10},
3339
"placement_group_factory": PlacementGroupFactory([{"CPU": 2, "GPU": 1}]),
3440
"config": {"sleep": 1},
41+
"storage": STORAGE,
3542
}
3643
trials = [
3744
Trial("__fake", **kwargs),
@@ -107,7 +114,9 @@ def test_remove_actor_tracking(ray_start_4_cpus_2_gpus_extra, resource_manager_c
107114
in ``self._stopping_trials``.
108115
"""
109116
runner = TuneController(
110-
resource_manager_factory=lambda: resource_manager_cls(), reuse_actors=True
117+
resource_manager_factory=lambda: resource_manager_cls(),
118+
reuse_actors=True,
119+
storage=STORAGE,
111120
)
112121

113122
def train(config):
@@ -117,6 +126,7 @@ def train(config):
117126

118127
kwargs = {
119128
"placement_group_factory": PlacementGroupFactory([{"CPU": 4, "GPU": 2}]),
129+
"storage": STORAGE,
120130
}
121131
trials = [Trial("test_remove_actor_tracking", **kwargs) for i in range(4)]
122132
for t in trials:

python/ray/tune/tests/execution/test_controller_errors_integration.py

+14 -2
Original file line number | Diff line number | Diff line change
@@ -13,9 +13,14 @@
1313
from ray.tune.registry import TRAINABLE_CLASS, _global_registry
1414
from ray.tune.schedulers import FIFOScheduler
1515
from ray.tune.search import BasicVariantGenerator
16+
17+
from ray.train.tests.util import mock_storage_context
1618
from ray.tune.tests.execution.utils import BudgetResourceManager
1719

1820

21+
STORAGE = mock_storage_context()
22+
23+
1924
@pytest.fixture(scope="function")
2025
def ray_start_4_cpus_2_gpus_extra():
2126
address_info = ray.init(num_cpus=4, num_gpus=2, resources={"a": 2})
@@ -53,11 +58,12 @@ def test_invalid_trainable(ray_start_4_cpus_2_gpus_extra, resource_manager_cls):
5358
Legacy test: test_trial_runner_2.py::TrialRunnerTest::testErrorHandling
5459
"""
5560
runner = TuneController(
56-
resource_manager_factory=lambda: resource_manager_cls(),
61+
resource_manager_factory=lambda: resource_manager_cls(), storage=STORAGE
5762
)
5863
kwargs = {
5964
"stopping_criterion": {"training_iteration": 1},
6065
"placement_group_factory": PlacementGroupFactory([{"CPU": 1, "GPU": 1}]),
66+
"storage": STORAGE,
6167
}
6268
_global_registry.register(TRAINABLE_CLASS, "asdf", None)
6369
trials = [Trial("asdf", **kwargs), Trial("__fake", **kwargs)]
@@ -78,6 +84,7 @@ def test_overstep(ray_start_4_cpus_2_gpus_extra):
7884
os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"
7985
runner = TuneController(
8086
resource_manager_factory=lambda: BudgetResourceManager({"CPU": 4}),
87+
storage=STORAGE,
8188
)
8289
runner.step()
8390
with pytest.raises(TuneError):
@@ -106,13 +113,15 @@ def test_failure_recovery(
106113
search_alg=searchalg,
107114
scheduler=scheduler,
108115
resource_manager_factory=lambda: resource_manager_cls(),
116+
storage=STORAGE,
109117
)
110118
kwargs = {
111119
"placement_group_factory": PlacementGroupFactory([{"CPU": 1, "GPU": 1}]),
112120
"stopping_criterion": {"training_iteration": 2},
113121
"checkpoint_config": CheckpointConfig(checkpoint_frequency=1),
114122
"max_failures": max_failures,
115123
"config": {"mock_error": True, "persistent_error": persistent_error},
124+
"storage": STORAGE,
116125
}
117126
runner.add_trial(Trial("__fake", **kwargs))
118127
trials = runner.get_trials()
@@ -155,7 +164,9 @@ def test_fail_fast(ray_start_4_cpus_2_gpus_extra, resource_manager_cls, fail_fas
155164
"""
156165

157166
runner = TuneController(
158-
resource_manager_factory=lambda: resource_manager_cls(), fail_fast=fail_fast
167+
resource_manager_factory=lambda: resource_manager_cls(),
168+
fail_fast=fail_fast,
169+
storage=STORAGE,
159170
)
160171
kwargs = {
161172
"placement_group_factory": PlacementGroupFactory([{"CPU": 1, "GPU": 1}]),
@@ -165,6 +176,7 @@ def test_fail_fast(ray_start_4_cpus_2_gpus_extra, resource_manager_cls, fail_fas
165176
"mock_error": True,
166177
"persistent_error": True,
167178
},
179+
"storage": STORAGE,
168180
}
169181
runner.add_trial(Trial("__fake", **kwargs))
170182
runner.add_trial(Trial("__fake", **kwargs))

0 commit comments

Comments
 (0)