From 152eb57defc37ca09478a344a25777dd164a7452 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 26 Nov 2021 18:13:14 +0100 Subject: [PATCH] Rename special to standalone (#10779) --- .azure-pipelines/gpu-tests.yml | 4 +- .../test_accelerator_connector.py | 2 +- tests/accelerators/test_ddp.py | 2 +- tests/accelerators/test_multi_nodes_gpu.py | 4 +- tests/callbacks/test_pruning.py | 2 +- tests/callbacks/test_stochastic_weight_avg.py | 2 +- tests/callbacks/test_tqdm_progress_bar.py | 2 +- .../test_checkpoint_callback_frequency.py | 2 +- tests/conftest.py | 8 +-- tests/core/test_metric_result_integration.py | 2 +- tests/helpers/runif.py | 12 ++-- tests/lite/test_lite.py | 2 +- tests/lite/test_parity.py | 2 +- tests/models/test_hooks.py | 4 +- tests/models/test_sync_batchnorm.py | 2 +- .../environments/torch_elastic_deadlock.py | 2 +- tests/plugins/test_amp_plugins.py | 2 +- ..._ddp_fully_sharded_with_full_state_dict.py | 6 +- tests/plugins/test_ddp_plugin.py | 4 +- .../plugins/test_ddp_plugin_with_comm_hook.py | 10 ++-- tests/plugins/test_deepspeed_plugin.py | 58 +++++++++---------- tests/plugins/test_sharded_plugin.py | 6 +- tests/profiler/test_profiler.py | 6 +- .../{special_tests.sh => standalone_tests.sh} | 14 ++--- .../logging_/test_train_loop_logging.py | 2 +- .../optimization/test_manual_optimization.py | 4 +- tests/trainer/optimization/test_optimizers.py | 2 +- tests/trainer/test_trainer.py | 6 +- tests/utilities/test_all_gather_grad.py | 4 +- .../test_deepspeed_collate_checkpoint.py | 2 +- tests/utilities/test_meta.py | 2 +- tests/utilities/test_warnings.py | 4 +- 32 files changed, 93 insertions(+), 93 deletions(-) rename tests/{special_tests.sh => standalone_tests.sh} (82%) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index 71332a840fdb0..8752e8584439a 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -72,10 +72,10 @@ jobs: displayName: 'Testing: standard' - bash: | - bash tests/special_tests.sh + bash tests/standalone_tests.sh env: PL_USE_MOCKED_MNIST: "1" - displayName: 'Testing: special' + displayName: 'Testing: standalone' - bash: | python -m coverage report diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py index c95c7dc517ef0..51316c155368c 100644 --- a/tests/accelerators/test_accelerator_connector.py +++ b/tests/accelerators/test_accelerator_connector.py @@ -337,7 +337,7 @@ def on_fit_start(self, trainer, pl_module): trainer.fit(model) -@RunIf(skip_windows=True, special=True) +@RunIf(skip_windows=True, standalone=True) def test_accelerator_choice_ddp_cpu_and_strategy(tmpdir): """Test that accelerator="ddp_cpu" can work together with an instance of DDPPlugin.""" _test_accelerator_choice_ddp_cpu_and_strategy(tmpdir, ddp_strategy_class=DDPPlugin) diff --git a/tests/accelerators/test_ddp.py b/tests/accelerators/test_ddp.py index 1982e967c21ea..db2f388971c12 100644 --- a/tests/accelerators/test_ddp.py +++ b/tests/accelerators/test_ddp.py @@ -108,7 +108,7 @@ def setup(self, stage: Optional[str] = None) -> None: trainer.fit(model) -@RunIf(min_gpus=2, min_torch="1.8.1", special=True) +@RunIf(min_gpus=2, min_torch="1.8.1", standalone=True) @pytest.mark.parametrize("precision", (16, 32)) def test_ddp_wrapper(tmpdir, precision): """Test parameters to ignore are carried over for DDP.""" diff --git a/tests/accelerators/test_multi_nodes_gpu.py b/tests/accelerators/test_multi_nodes_gpu.py index 0df49a41b0fd0..09f632746b1dd 100644 --- 
a/tests/accelerators/test_multi_nodes_gpu.py +++ b/tests/accelerators/test_multi_nodes_gpu.py @@ -31,7 +31,7 @@ # TODO(Borda): When multi-node tests are re-enabled (.github/workflows/ci_test-mnodes.yml) # use an environment variable `PL_RUNNING_MULTINODE_TESTS` and set `RunIf(multinode=True)` @pytest.mark.skip("Multi-node testing is currently disabled") -@RunIf(special=True) +@RunIf(standalone=True) def test_logging_sync_dist_true_ddp(tmpdir): """Tests to ensure that the sync_dist flag works with CPU (should just return the original value)""" fake_result = 1 @@ -68,7 +68,7 @@ def validation_step(self, batch, batch_idx): # TODO(Borda): When multi-node tests are re-enabled (.github/workflows/ci_test-mnodes.yml) # use an environment variable `PL_RUNNING_MULTINODE_TESTS` and set `RunIf(multinode=True)` @pytest.mark.skip("Multi-node testing is currently disabled") -@RunIf(special=True) +@RunIf(standalone=True) def test__validation_step__log(tmpdir): """Tests that validation_step can log.""" diff --git a/tests/callbacks/test_pruning.py b/tests/callbacks/test_pruning.py index ec4dcddf777c0..f63892df94310 100644 --- a/tests/callbacks/test_pruning.py +++ b/tests/callbacks/test_pruning.py @@ -160,7 +160,7 @@ def test_pruning_callback( ) -@RunIf(special=True, min_gpus=2) +@RunIf(standalone=True, min_gpus=2) @pytest.mark.parametrize("parameters_to_prune", (False, True)) @pytest.mark.parametrize("use_global_unstructured", (False, True)) def test_pruning_callback_ddp(tmpdir, parameters_to_prune, use_global_unstructured): diff --git a/tests/callbacks/test_stochastic_weight_avg.py b/tests/callbacks/test_stochastic_weight_avg.py index d30edb177ed10..584e24bb71ed9 100644 --- a/tests/callbacks/test_stochastic_weight_avg.py +++ b/tests/callbacks/test_stochastic_weight_avg.py @@ -138,7 +138,7 @@ def train_with_swa( assert trainer.lightning_module == model -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) def test_swa_callback_ddp(tmpdir): train_with_swa(tmpdir, strategy="ddp", gpus=2) diff --git a/tests/callbacks/test_tqdm_progress_bar.py b/tests/callbacks/test_tqdm_progress_bar.py index 1ff1a602fe3b6..ba66ad169f473 100644 --- a/tests/callbacks/test_tqdm_progress_bar.py +++ b/tests/callbacks/test_tqdm_progress_bar.py @@ -512,7 +512,7 @@ def test_tqdm_progress_bar_can_be_pickled(): pickle.dumps(bar) -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) @pytest.mark.parametrize( ["total_train_samples", "train_batch_size", "total_val_samples", "val_batch_size", "val_check_interval"], [(8, 4, 2, 1, 0.2), (8, 4, 2, 1, 0.5)], diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py index fd5c76b2faef7..2c14c7de29b9c 100644 --- a/tests/checkpointing/test_checkpoint_callback_frequency.py +++ b/tests/checkpointing/test_checkpoint_callback_frequency.py @@ -87,7 +87,7 @@ def training_step(self, batch, batch_idx): @mock.patch("torch.save") -@RunIf(special=True, min_gpus=2) +@RunIf(standalone=True, min_gpus=2) @pytest.mark.parametrize(["k", "epochs", "val_check_interval", "expected"], [(1, 1, 1.0, 1), (2, 2, 0.3, 4)]) def test_top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected): class TestModel(BoringModel): diff --git a/tests/conftest.py b/tests/conftest.py index b001894f97918..176cc4342ee17 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -172,13 +172,13 @@ def single_process_pg(): def pytest_collection_modifyitems(items): - if os.getenv("PL_RUNNING_SPECIAL_TESTS", "0") != "1": + if 
os.getenv("PL_RUN_STANDALONE_TESTS", "0") != "1": return - # filter out non-special tests + # filter out non-standalone tests items[:] = [ item for item in items for marker in item.own_markers - # has `@RunIf(special=True)` - if marker.name == "skipif" and marker.kwargs.get("special") + # has `@RunIf(standalone=True)` + if marker.name == "skipif" and marker.kwargs.get("standalone") ] diff --git a/tests/core/test_metric_result_integration.py b/tests/core/test_metric_result_integration.py index 0d2e2a261e775..e506fc2927f7e 100644 --- a/tests/core/test_metric_result_integration.py +++ b/tests/core/test_metric_result_integration.py @@ -480,7 +480,7 @@ def test_result_collection_reload_1_gpu_ddp(tmpdir): result_collection_reload(default_root_dir=tmpdir, strategy="ddp", gpus=1) -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) def test_result_collection_reload_2_gpus(tmpdir): result_collection_reload(default_root_dir=tmpdir, strategy="ddp", gpus=2) diff --git a/tests/helpers/runif.py b/tests/helpers/runif.py index 5cdf422cf4fdb..4ad6942aa160a 100644 --- a/tests/helpers/runif.py +++ b/tests/helpers/runif.py @@ -65,7 +65,7 @@ def __new__( horovod: bool = False, horovod_nccl: bool = False, skip_windows: bool = False, - special: bool = False, + standalone: bool = False, fairscale: bool = False, fairscale_fully_sharded: bool = False, deepspeed: bool = False, @@ -87,7 +87,7 @@ def __new__( horovod: if Horovod is installed horovod_nccl: if Horovod is installed with NCCL support skip_windows: skip test for Windows platform (typically for some limited torch functionality) - special: running in special mode, outside pytest suit + standalone: Mark the test as standalone, our CI will run it in a separate process. 
fairscale: if `fairscale` module is required to run the test fairscale_fully_sharded: if `fairscale` fully sharded module is required to run the test deepspeed: if `deepspeed` module is required to run the test @@ -146,12 +146,12 @@ def __new__( conditions.append(not _HOROVOD_NCCL_AVAILABLE) reasons.append("Horovod with NCCL") - if special: - env_flag = os.getenv("PL_RUNNING_SPECIAL_TESTS", "0") + if standalone: + env_flag = os.getenv("PL_RUN_STANDALONE_TESTS", "0") conditions.append(env_flag != "1") - reasons.append("Special execution") + reasons.append("Standalone execution") # used in tests/conftest.py::pytest_collection_modifyitems - kwargs["special"] = True + kwargs["standalone"] = True if fairscale: conditions.append(not _FAIRSCALE_AVAILABLE) diff --git a/tests/lite/test_lite.py b/tests/lite/test_lite.py index 663001d08df54..1e8bf40e83319 100644 --- a/tests/lite/test_lite.py +++ b/tests/lite/test_lite.py @@ -380,7 +380,7 @@ def test_autocast(): lite._precision_plugin.forward_context().__exit__.assert_called() -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_multiple_models(): class Lite(LightningLite): def run(self): diff --git a/tests/lite/test_parity.py b/tests/lite/test_parity.py index bec9339ec8e2f..d4d0ca6e5e9c7 100644 --- a/tests/lite/test_parity.py +++ b/tests/lite/test_parity.py @@ -190,7 +190,7 @@ def test_boring_lite_model_ddp_spawn(precision, strategy, devices, accelerator, assert torch.equal(w_pure.cpu(), w_lite.cpu()) -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) @pytest.mark.parametrize( "precision, strategy, devices, accelerator", [ diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 35b50acfcef4f..e8db816ed4edc 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -167,7 +167,7 @@ def transfer_batch_to_device(self, batch, device, dataloader_idx): assert torch.allclose(batch_gpu.targets.cpu(), torch.ones(5, 1, dtype=torch.long) * 2) -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) def test_transfer_batch_hook_ddp(tmpdir): """Test custom data are properly moved to the right device using ddp.""" @@ -426,7 +426,7 @@ def _predict_batch(trainer, model, batches): return out -@RunIf(deepspeed=True, min_gpus=1, special=True) +@RunIf(deepspeed=True, min_gpus=1, standalone=True) @pytest.mark.parametrize("automatic_optimization", (True, False)) def test_trainer_model_hook_system_fit_deepspeed(tmpdir, automatic_optimization): _run_trainer_model_hook_system_fit( diff --git a/tests/models/test_sync_batchnorm.py b/tests/models/test_sync_batchnorm.py index 67880bec4e474..86c4a5af68b91 100644 --- a/tests/models/test_sync_batchnorm.py +++ b/tests/models/test_sync_batchnorm.py @@ -67,7 +67,7 @@ def configure_optimizers(self): # TODO: Fatal Python error: Bus error @pytest.mark.skip(reason="Fatal Python error: Bus error") -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) def test_sync_batchnorm_ddp(tmpdir): seed_everything(234) set_random_main_port() diff --git a/tests/plugins/environments/torch_elastic_deadlock.py b/tests/plugins/environments/torch_elastic_deadlock.py index ead433200c304..f8a64ba632991 100644 --- a/tests/plugins/environments/torch_elastic_deadlock.py +++ b/tests/plugins/environments/torch_elastic_deadlock.py @@ -7,7 +7,7 @@ from pytorch_lightning.utilities.exceptions import DeadlockDetectedException from tests.helpers.boring_model import BoringModel -if 
os.getenv("PL_RUNNING_SPECIAL_TESTS", "0") == "1" and os.getenv("PL_RECONCILE_PROCESS", "0") == "1": +if os.getenv("PL_RUN_STANDALONE_TESTS", "0") == "1" and os.getenv("PL_RECONCILE_PROCESS", "0") == "1": class CustomException(Exception): pass diff --git a/tests/plugins/test_amp_plugins.py b/tests/plugins/test_amp_plugins.py index 8f563f0e410e2..24c04de6604ef 100644 --- a/tests/plugins/test_amp_plugins.py +++ b/tests/plugins/test_amp_plugins.py @@ -190,7 +190,7 @@ def configure_optimizers(self): trainer.fit(model) -@RunIf(min_gpus=2, amp_apex=True, special=True) +@RunIf(min_gpus=2, amp_apex=True, standalone=True) @pytest.mark.parametrize("amp_level", ["O2"]) def test_amp_apex_ddp_fit(amp_level, tmpdir): class CustomBoringModel(BoringModel): diff --git a/tests/plugins/test_ddp_fully_sharded_with_full_state_dict.py b/tests/plugins/test_ddp_fully_sharded_with_full_state_dict.py index c0fab297173e7..6967ea9a12bd7 100644 --- a/tests/plugins/test_ddp_fully_sharded_with_full_state_dict.py +++ b/tests/plugins/test_ddp_fully_sharded_with_full_state_dict.py @@ -89,7 +89,7 @@ def _assert_layer_fsdp_instance(self) -> None: assert self.layer.module[2].reshard_after_forward is True -@RunIf(min_gpus=1, skip_windows=True, fairscale_fully_sharded=True, special=True) +@RunIf(min_gpus=1, skip_windows=True, fairscale_fully_sharded=True, standalone=True) def test_fully_sharded_plugin_checkpoint(tmpdir): """Test to ensure that checkpoint is saved correctly when using a single GPU, and all stages can be run.""" @@ -98,7 +98,7 @@ def test_fully_sharded_plugin_checkpoint(tmpdir): _run_multiple_stages(trainer, model, os.path.join(tmpdir, "last.ckpt")) -@RunIf(min_gpus=2, skip_windows=True, fairscale_fully_sharded=True, special=True) +@RunIf(min_gpus=2, skip_windows=True, fairscale_fully_sharded=True, standalone=True) def test_fully_sharded_plugin_checkpoint_multi_gpus(tmpdir): """Test to ensure that checkpoint is saved correctly when using multiple GPUs, and all stages can be run.""" @@ -136,7 +136,7 @@ def _run_multiple_stages(trainer, model, model_path: Optional[str] = None): trainer.test(ckpt_path=model_path) -@RunIf(min_gpus=1, skip_windows=True, fairscale_fully_sharded=True, special=True) +@RunIf(min_gpus=1, skip_windows=True, fairscale_fully_sharded=True, standalone=True) def test_fsdp_gradient_clipping_raises(tmpdir): """Test to ensure that an exception is raised when clipping gradients by value with FSDP.""" model = BoringModel() diff --git a/tests/plugins/test_ddp_plugin.py b/tests/plugins/test_ddp_plugin.py index 78ae931330307..1aaf89d052686 100644 --- a/tests/plugins/test_ddp_plugin.py +++ b/tests/plugins/test_ddp_plugin.py @@ -33,7 +33,7 @@ def on_train_start(self) -> None: self.start_cuda_memory = torch.cuda.memory_allocated() -@RunIf(skip_windows=True, min_gpus=2, special=True) +@RunIf(skip_windows=True, min_gpus=2, standalone=True) def test_ddp_with_2_gpus(): """Tests if device is set correctely when training and after teardown for DDPPlugin.""" trainer = Trainer(gpus=2, strategy="ddp", fast_dev_run=True) @@ -64,7 +64,7 @@ def on_train_start(self): self.trainer.training_type_plugin.barrier("barrier after model is wrapped") -@RunIf(min_gpus=4, special=True) +@RunIf(min_gpus=4, standalone=True) @mock.patch("torch.distributed.barrier") def test_ddp_barrier_non_consecutive_device_ids(barrier_mock, tmpdir): """Test correct usage of barriers when device ids do not start at 0 or are not consecutive.""" diff --git a/tests/plugins/test_ddp_plugin_with_comm_hook.py 
b/tests/plugins/test_ddp_plugin_with_comm_hook.py index efcb089487c5b..7ee46fe0c52c3 100644 --- a/tests/plugins/test_ddp_plugin_with_comm_hook.py +++ b/tests/plugins/test_ddp_plugin_with_comm_hook.py @@ -26,7 +26,7 @@ import torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook as post_localSGD -@RunIf(skip_windows=True, min_torch="1.9.0", min_gpus=2, special=True) +@RunIf(skip_windows=True, min_torch="1.9.0", min_gpus=2, standalone=True) def test_ddp_fp16_compress_comm_hook(tmpdir): """Test for DDP FP16 compress hook.""" model = BoringModel() @@ -46,7 +46,7 @@ def test_ddp_fp16_compress_comm_hook(tmpdir): assert trainer.state.finished, f"Training failed with {trainer.state}" -@RunIf(skip_windows=True, min_torch="1.9.0", min_gpus=2, special=True) +@RunIf(skip_windows=True, min_torch="1.9.0", min_gpus=2, standalone=True) def test_ddp_sgd_comm_hook(tmpdir): """Test for DDP FP16 compress hook.""" model = BoringModel() @@ -69,7 +69,7 @@ def test_ddp_sgd_comm_hook(tmpdir): assert trainer.state.finished, f"Training failed with {trainer.state}" -@RunIf(skip_windows=True, min_torch="1.9.0", min_gpus=2, special=True) +@RunIf(skip_windows=True, min_torch="1.9.0", min_gpus=2, standalone=True) def test_ddp_fp16_compress_wrap_sgd_comm_hook(tmpdir): """Test for DDP FP16 compress wrapper for SGD hook.""" model = BoringModel() @@ -93,7 +93,7 @@ def test_ddp_fp16_compress_wrap_sgd_comm_hook(tmpdir): assert trainer.state.finished, f"Training failed with {trainer.state}" -@RunIf(skip_windows=True, min_torch="1.9.0", min_gpus=2, special=True) +@RunIf(skip_windows=True, min_torch="1.9.0", min_gpus=2, standalone=True) def test_ddp_spawn_fp16_compress_comm_hook(tmpdir): """Test for DDP Spawn FP16 compress hook.""" model = BoringModel() @@ -110,7 +110,7 @@ def test_ddp_spawn_fp16_compress_comm_hook(tmpdir): assert trainer.state.finished, f"Training failed with {trainer.state}" -@RunIf(skip_windows=True, min_torch="1.10.0", min_gpus=2, special=True) +@RunIf(skip_windows=True, min_torch="1.10.0", min_gpus=2, standalone=True) def test_ddp_post_local_sgd_comm_hook(tmpdir): """Test for DDP post-localSGD hook.""" model = BoringModel() diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 397803e1d8a17..7cca6f6724656 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -203,7 +203,7 @@ def test_deepspeed_defaults(tmpdir): assert isinstance(plugin.config["zero_optimization"], dict) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_warn_deepspeed_ignored(tmpdir): class TestModel(BoringModel): def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args, **kwargs) -> None: @@ -259,7 +259,7 @@ def setup(self, trainer, pl_module, stage: Optional[str] = None) -> None: trainer.fit(model) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_run_configure_optimizers(tmpdir): """Test end to end that deepspeed works with defaults (without ZeRO as that requires compilation), whilst using configure_optimizers for optimizers and schedulers.""" @@ -296,7 +296,7 @@ def configure_optimizers(self): _assert_save_model_is_equal(model, tmpdir, trainer) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_config(tmpdir, deepspeed_zero_config): """Test to ensure deepspeed works correctly when passed a DeepSpeed config object 
including optimizers/schedulers and saves the model weights to load correctly.""" @@ -324,7 +324,7 @@ def on_train_start(self, trainer, pl_module) -> None: trainer.test(model) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_custom_precision_params(tmpdir): """Ensure if we modify the FP16 parameters via the DeepSpeedPlugin, the deepspeed config contains these changes.""" @@ -386,7 +386,7 @@ def on_before_accelerator_backend_setup(self, trainer, pl_module) -> None: trainer.fit(model) -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_multigpu(tmpdir): """Test to ensure that DeepSpeed with multiple GPUs works and deepspeed distributed is initialized correctly.""" @@ -402,14 +402,14 @@ def test_deepspeed_multigpu(tmpdir): _assert_save_model_is_equal(model, tmpdir, trainer) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_fp32_works(tmpdir): model = BoringModel() trainer = Trainer(default_root_dir=tmpdir, gpus=1, strategy="deepspeed_stage_3", fast_dev_run=True) trainer.fit(model) -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_stage_3_save_warning(tmpdir): """Test to ensure that DeepSpeed Stage 3 gives a warning when saving on rank zero.""" model = BoringModel() @@ -429,7 +429,7 @@ def test_deepspeed_stage_3_save_warning(tmpdir): trainer.save_checkpoint(checkpoint_path) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_multigpu_single_file(tmpdir): """Test to ensure that DeepSpeed loads from a single file checkpoint.""" model = BoringModel() @@ -538,7 +538,7 @@ def training_step(self, batch, batch_idx): opt.step() -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_multigpu_stage_3(tmpdir, deepspeed_config): """Test to ensure ZeRO Stage 3 works with a parallel model.""" model = ModelParallelBoringModel() @@ -551,7 +551,7 @@ def test_deepspeed_multigpu_stage_3(tmpdir, deepspeed_config): _assert_save_model_is_equal(model, tmpdir, trainer) -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_multigpu_stage_3_manual_optimization(tmpdir, deepspeed_config): """Test to ensure ZeRO Stage 3 works with a parallel model.""" model = ModelParallelBoringModelManualOptim() @@ -600,14 +600,14 @@ def run_checkpoint_test(tmpdir: str, automatic_optimization: bool = True, accumu assert results[0]["test_acc"] > 0.7 -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir): """Test to ensure with Stage 3 and multiple GPUs that we can save/load a model resuming from a checkpoint, and see convergence.""" run_checkpoint_test(tmpdir) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_multigpu_stage_3_warns_resume_training(tmpdir): """Test to ensure with Stage 3 and multiple GPUs that we can resume from training, throwing a warning that the optimizer state and scheduler states cannot be restored.""" @@ -634,7 +634,7 @@ def test_deepspeed_multigpu_stage_3_warns_resume_training(tmpdir): trainer.fit(model, datamodule=dm, ckpt_path=checkpoint_path) 
-@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_multigpu_stage_3_resume_training(tmpdir): """Test to ensure with Stage 3 and multiple GPUs that we can resume training.""" initial_model = ModelParallelClassificationModel() @@ -688,19 +688,19 @@ def on_train_batch_start( trainer.fit(model, datamodule=dm, ckpt_path=ck.best_model_path) -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_multigpu_stage_3_checkpointing_full_weights_manual(tmpdir): """Test to ensure with Stage 3 and multiple GPUs that we can save/load a model resuming from a checkpoint, where we save the full weights to one file.""" run_checkpoint_test(tmpdir, automatic_optimization=False, accumulate_grad_batches=1) -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir): _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer=False) -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_multigpu_stage_2_accumulated_grad_batches_offload_optimizer(tmpdir): _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer=True) @@ -741,7 +741,7 @@ def on_train_batch_start(self, trainer, pl_module: LightningModule, batch: Any, assert verification_callback.on_train_batch_start_called -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_multigpu_test(tmpdir): """Test to ensure we can use DeepSpeed with just test using ZeRO Stage 3.""" model = ModelParallelBoringModel() @@ -751,7 +751,7 @@ def test_deepspeed_multigpu_test(tmpdir): trainer.test(model) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_multigpu_partial_partition_parameters(tmpdir): """Test to ensure that a module that defines a layer inside the ``__init__`` and ``configure_sharded_model`` correctly converts all parameters to float16 when ``precision=16`` and runs successfully.""" @@ -778,7 +778,7 @@ def on_train_epoch_start(self) -> None: trainer.fit(model) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_multigpu_test_rnn(tmpdir): """Test to ensure that turning off explicit partitioning of the entire module for ZeRO Stage 3 works when training with certain layers which will crash with explicit partitioning.""" @@ -849,7 +849,7 @@ def _assert_save_model_is_equal(model, tmpdir, trainer): assert torch.equal(orig_param, saved_model_param) -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_multigpu_no_schedulers(tmpdir): """Test to ensure ZeRO Stage 3 works with a parallel model and no schedulers.""" model = ModelParallelBoringModelNoSchedulers() @@ -861,7 +861,7 @@ def test_deepspeed_multigpu_no_schedulers(tmpdir): _assert_save_model_is_equal(model, tmpdir, trainer) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_skip_backward_raises(tmpdir): class TestModel(BoringModel): def training_step(self, batch, batch_idx): @@ -873,7 +873,7 @@ def training_step(self, batch, batch_idx): trainer.fit(model) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, 
deepspeed=True, standalone=True) def test_deepspeed_setup_train_dataloader(tmpdir): """Test DeepSpeed works when setup is required to call in the DataModule.""" @@ -911,7 +911,7 @@ def test_dataloader(self): @mock.patch("torch.optim.lr_scheduler.StepLR.step", autospec=True) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_scheduler_step_count(mock_step): """Test to ensure that the scheduler is called the correct amount of times during training when scheduler is set to step.""" @@ -919,7 +919,7 @@ def test_deepspeed_scheduler_step_count(mock_step): @mock.patch("torch.optim.lr_scheduler.StepLR.step", autospec=True) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_scheduler_step_count_epoch(mock_step): """Test to ensure that the scheduler is called the correct amount of times during training when scheduler is set to epoch.""" @@ -954,7 +954,7 @@ def configure_optimizers(self): assert mock_step.call_count == 1 + (max_epoch * limit_train_batches) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_configure_gradient_clipping(tmpdir): """Test to ensure that a warning is raised when `LightningModule.configure_gradient_clipping` is overridden in case of deepspeed.""" @@ -975,7 +975,7 @@ def configure_gradient_clipping(self, optimizer, optimizer_idx, gradient_clip_va trainer.fit(model) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_gradient_clip_by_value(tmpdir): """Test to ensure that an exception is raised when using `gradient_clip_algorithm='value'`.""" model = BoringModel() @@ -989,7 +989,7 @@ def test_deepspeed_gradient_clip_by_value(tmpdir): trainer.fit(model) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_different_accumulate_grad_batches_fails(tmpdir): model = BoringModel() trainer = Trainer(default_root_dir=tmpdir, accumulate_grad_batches={1: 2}, gpus=1, strategy="deepspeed") @@ -999,7 +999,7 @@ def test_different_accumulate_grad_batches_fails(tmpdir): trainer.fit(model) -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_specific_gpu_device_id(tmpdir): class TestCallback(Callback): def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: @@ -1035,7 +1035,7 @@ def on_test_batch_start( trainer.test(model) -@RunIf(min_gpus=2, deepspeed=True, special=True, min_torch="1.10.0") +@RunIf(min_gpus=2, deepspeed=True, standalone=True, min_torch="1.10.0") def test_deepspeed_with_meta_device(tmpdir): with init_meta_context(): model = BoringModel() diff --git a/tests/plugins/test_sharded_plugin.py b/tests/plugins/test_sharded_plugin.py index e80b5d9f7621e..8a55633fb143e 100644 --- a/tests/plugins/test_sharded_plugin.py +++ b/tests/plugins/test_sharded_plugin.py @@ -175,7 +175,7 @@ def test_ddp_sharded_plugin_fit_ckpt_path_gpu_to_cpu(tmpdir): trainer.fit(model, ckpt_path=checkpoint_path) -@RunIf(skip_windows=True, special=True, fairscale=True) +@RunIf(skip_windows=True, standalone=True, fairscale=True) @pytest.mark.parametrize("trainer_kwargs", (dict(num_processes=2), pytest.param(dict(gpus=2), marks=RunIf(min_gpus=2)))) def test_ddp_sharded_plugin_test_multigpu(tmpdir, trainer_kwargs): """Test to ensure we can use validate and test without fit.""" @@ -201,7 +201,7 @@ def 
training_step(self, batch, batch_idx): return {"loss": loss} -@RunIf(skip_windows=True, special=True, fairscale=True, min_gpus=2) +@RunIf(skip_windows=True, standalone=True, fairscale=True, min_gpus=2) def test_ddp_sharded_plugin_manual_optimization_spawn(tmpdir): # todo (sean): this test has been split out as running both tests using parametrize causes "Address in use" model = ManualBoringModel() @@ -209,7 +209,7 @@ def test_ddp_sharded_plugin_manual_optimization_spawn(tmpdir): trainer.fit(model) -@RunIf(skip_windows=True, special=True, fairscale=True, min_gpus=2) +@RunIf(skip_windows=True, standalone=True, fairscale=True, min_gpus=2) def test_ddp_sharded_plugin_manual_optimization(tmpdir): model = ManualBoringModel() trainer = Trainer(default_root_dir=tmpdir, strategy="ddp_sharded", fast_dev_run=2, gpus=2) diff --git a/tests/profiler/test_profiler.py b/tests/profiler/test_profiler.py index 4d18648b6a7f1..126a9a6d1dee6 100644 --- a/tests/profiler/test_profiler.py +++ b/tests/profiler/test_profiler.py @@ -295,7 +295,7 @@ def test_advanced_profiler_cprofile_deepcopy(tmpdir): trainer.fit(model) -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) def test_pytorch_profiler_trainer_ddp(tmpdir, pytorch_profiler): """Ensure that the profiler can be given to the training and default step are properly recorded.""" model = BoringModel() @@ -333,7 +333,7 @@ def test_pytorch_profiler_trainer_ddp(tmpdir, pytorch_profiler): assert any(f"{local_rank}-validation_step" in f for f in files) -@RunIf(special=True) +@RunIf(standalone=True) @pytest.mark.parametrize("fast_dev_run", [1, 2, 3, 4, 5]) @pytest.mark.parametrize("boring_model_cls", [ManualOptimBoringModel, BoringModel]) def test_pytorch_profiler_trainer_fit(fast_dev_run, boring_model_cls, tmpdir): @@ -428,7 +428,7 @@ def look_for_trace(trace_dir): assert look_for_trace(tmpdir) -@RunIf(min_gpus=1, special=True) +@RunIf(min_gpus=1, standalone=True) def test_pytorch_profiler_nested_emit_nvtx(tmpdir): """This test check emit_nvtx is correctly supported.""" profiler = PyTorchProfiler(use_cuda=True, emit_nvtx=True) diff --git a/tests/special_tests.sh b/tests/standalone_tests.sh similarity index 82% rename from tests/special_tests.sh rename to tests/standalone_tests.sh index 27abaa6cc62e3..49c608d53cfa1 100755 --- a/tests/special_tests.sh +++ b/tests/standalone_tests.sh @@ -15,12 +15,12 @@ set -e # this environment variable allows special tests to run -export PL_RUNNING_SPECIAL_TESTS=1 +export PL_RUN_STANDALONE_TESTS=1 # python arguments defaults='-m coverage run --source pytorch_lightning --append -m pytest --capture=no' -# find tests marked as `@RunIf(special=True)`. done manually instead of with pytest because it is faster -grep_output=$(grep --recursive --word-regexp 'tests' --regexp 'special=True' --include '*.py' --exclude 'tests/conftest.py') +# find tests marked as `@RunIf(standalone=True)`. done manually instead of with pytest because it is faster +grep_output=$(grep --recursive --word-regexp 'tests' --regexp 'standalone=True' --include '*.py' --exclude 'tests/conftest.py') # file paths, remove duplicates files=$(echo "$grep_output" | cut -f1 -d: | sort | uniq) @@ -47,10 +47,10 @@ for i in "${!parametrizations_arr[@]}"; do continue fi - # SPECIAL_PATTERN allows filtering the tests to run when debugging. - # use as `SPECIAL_PATTERN="foo_bar" ./special_tests.sh` to run only those + # STANDALONE_PATTERN allows filtering the tests to run when debugging. 
+ # use as `STANDALONE_PATTERN="foo_bar" ./standalone_tests.sh` to run only those # test with `foo_bar` in their name - if [[ $parametrization != *$SPECIAL_PATTERN* ]]; then + if [[ $parametrization != *$STANDALONE_PATTERN* ]]; then report+="Skipped\t$parametrization\n" continue fi @@ -74,7 +74,7 @@ fi # TODO: enable when CI uses torch>=1.9 # test deadlock is properly handled with TorchElastic. -# LOGS=$(PL_RUNNING_SPECIAL_TESTS=1 PL_RECONCILE_PROCESS=1 python -m torch.distributed.run --nproc_per_node=2 --max_restarts 0 -m coverage run --source pytorch_lightning -a tests/plugins/environments/torch_elastic_deadlock.py | grep "SUCCEEDED") +# LOGS=$(PL_RUN_STANDALONE_TESTS=1 PL_RECONCILE_PROCESS=1 python -m torch.distributed.run --nproc_per_node=2 --max_restarts 0 -m coverage run --source pytorch_lightning -a tests/plugins/environments/torch_elastic_deadlock.py | grep "SUCCEEDED") # if [ -z "$LOGS" ]; then # exit 1 # fi diff --git a/tests/trainer/logging_/test_train_loop_logging.py b/tests/trainer/logging_/test_train_loop_logging.py index 0ec61358d9408..6bfbaa9a7bcb1 100644 --- a/tests/trainer/logging_/test_train_loop_logging.py +++ b/tests/trainer/logging_/test_train_loop_logging.py @@ -434,7 +434,7 @@ def test_logging_sync_dist_true(tmpdir, devices): assert metrics["bar_3"] == 2 + int(use_multiple_devices) -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) def test_logging_sync_dist_true_ddp(tmpdir): """Tests to ensure that the sync_dist flag works with ddp.""" diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index ba4fe915fadb1..dbbb4d9bdffa7 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -840,7 +840,7 @@ def train_manual_optimization(tmpdir, strategy, model_cls=TesManualOptimizationD assert not torch.equal(param.cpu().data, param_copy.data) -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) def test_step_with_optimizer_closure_with_different_frequencies_ddp(tmpdir): """Tests that `step` works with optimizer_closure and different accumulated_gradient frequency.""" @@ -910,7 +910,7 @@ def dis_closure(): opt_dis.zero_grad() -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) def test_step_with_optimizer_closure_with_different_frequencies_ddp_with_toggle_model(tmpdir): train_manual_optimization(tmpdir, "ddp", model_cls=TestManualOptimizationDDPModelToggleModel) diff --git a/tests/trainer/optimization/test_optimizers.py b/tests/trainer/optimization/test_optimizers.py index b2d88becb1ec7..4a99b3318f06f 100644 --- a/tests/trainer/optimization/test_optimizers.py +++ b/tests/trainer/optimization/test_optimizers.py @@ -537,7 +537,7 @@ def configure_optimizers(self): trainer.fit(model) -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) def test_optimizer_state_on_device(tmpdir): """Test that optimizers that create state initially at instantiation still end up with the state on the GPU.""" diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 6004d4540a85f..6416ef88fb210 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1462,7 +1462,7 @@ def test_trainer_predict_cpu(tmpdir, datamodule, enable_progress_bar): predict(tmpdir, datamodule=datamodule, enable_progress_bar=enable_progress_bar) -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) @pytest.mark.parametrize( "kwargs", [ @@ -1471,7 
+1471,7 @@ def test_trainer_predict_cpu(tmpdir, datamodule, enable_progress_bar): {"strategy": "ddp", "devices": 2}, ], ) -def test_trainer_predict_special(tmpdir, kwargs): +def test_trainer_predict_standalone(tmpdir, kwargs): predict(tmpdir, accelerator="gpu", **kwargs) @@ -1899,7 +1899,7 @@ class CustomException(Exception): pass -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, standalone=True) def test_ddp_terminate_when_deadlock_is_detected(tmpdir): """Test that DDP kills the remaining processes when only one rank is throwing an exception.""" diff --git a/tests/utilities/test_all_gather_grad.py b/tests/utilities/test_all_gather_grad.py index 0ecafa347e574..b7dfd5cbc3311 100644 --- a/tests/utilities/test_all_gather_grad.py +++ b/tests/utilities/test_all_gather_grad.py @@ -60,7 +60,7 @@ def test_all_gather_ddp_spawn(): torch.multiprocessing.spawn(_test_all_gather_ddp, args=(world_size,), nprocs=world_size) -@RunIf(min_gpus=2, skip_windows=True, special=True) +@RunIf(min_gpus=2, skip_windows=True, standalone=True) def test_all_gather_collection(tmpdir): class TestModel(BoringModel): @@ -111,7 +111,7 @@ def training_epoch_end(self, outputs) -> None: assert model.training_epoch_end_called -@RunIf(min_gpus=2, skip_windows=True, special=True) +@RunIf(min_gpus=2, skip_windows=True, standalone=True) def test_all_gather_sync_grads(tmpdir): class TestModel(BoringModel): diff --git a/tests/utilities/test_deepspeed_collate_checkpoint.py b/tests/utilities/test_deepspeed_collate_checkpoint.py index e85557b4e6056..0f36ada39227d 100644 --- a/tests/utilities/test_deepspeed_collate_checkpoint.py +++ b/tests/utilities/test_deepspeed_collate_checkpoint.py @@ -22,7 +22,7 @@ from tests.helpers.runif import RunIf -@RunIf(min_gpus=2, deepspeed=True, special=True) +@RunIf(min_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_collate_checkpoint(tmpdir): """Test to ensure that with DeepSpeed Stage 3 we can collate the sharded checkpoints into a single file.""" model = BoringModel() diff --git a/tests/utilities/test_meta.py b/tests/utilities/test_meta.py index 581b949d9167f..1f386ac1ce0fe 100644 --- a/tests/utilities/test_meta.py +++ b/tests/utilities/test_meta.py @@ -31,7 +31,7 @@ def __init__(self, num_layers: int): self.layer = nn.Sequential(*[nn.Linear(1, 1) for _ in range(self.hparams.num_layers)]) -@RunIf(special=True, min_torch="1.10.0") +@RunIf(standalone=True, min_torch="1.10.0") def test_init_meta_context(): with init_meta_context(): diff --git a/tests/utilities/test_warnings.py b/tests/utilities/test_warnings.py index 6ef3793b5e0f3..af63bc905bce3 100644 --- a/tests/utilities/test_warnings.py +++ b/tests/utilities/test_warnings.py @@ -21,8 +21,8 @@ from pytorch_lightning.utilities.warnings import _warn, rank_zero_deprecation, rank_zero_warn, WarningCache -running_special = os.getenv("PL_RUNNING_SPECIAL_TESTS", "0") == "1" -if running_special: +standalone = os.getenv("PL_RUN_STANDALONE_TESTS", "0") == "1" +if standalone: stderr = StringIO() # recording