
Commit

Merge branch 'master' into tests/remove-random-from-conda
akihironitta committed Nov 18, 2021
2 parents 4aa3312 + 261ea90 · commit 73225bb
Showing 41 changed files with 338 additions and 302 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci_test-base.yml
@@ -19,7 +19,7 @@ jobs:
# this will install stable torch
python-version: [3.9]

# Timeout: https://stackoverflow.com/a/59076067/4521646
# lower timeout as this should run very quickly
timeout-minutes: 20
steps:
- uses: actions/checkout@v2
1 change: 0 additions & 1 deletion .github/workflows/ci_test-conda.yml
@@ -17,7 +17,6 @@ jobs:
python-version: ["3.8"] # previous to last Python version as that one is already used in test-full
pytorch-version: ["1.7", "1.8", "1.9", "1.10"] # nightly: add when there's a release candidate

# Timeout: https://stackoverflow.com/a/59076067/4521646
timeout-minutes: 35
steps:
- uses: actions/checkout@v2
2 changes: 0 additions & 2 deletions .github/workflows/ci_test-full.yml
@@ -29,8 +29,6 @@ jobs:
# nightly: add when there's a release candidate
#- {os: ubuntu-20.04, python-version: "3.10", requires: "latest", release: "pre"}

# Timeout: https://stackoverflow.com/a/59076067/4521646
# TODO: the macOS is taking too long, probably caching did not work...
timeout-minutes: 40

steps:
2 changes: 1 addition & 1 deletion .github/workflows/probot-auto-cc.yml
@@ -9,7 +9,7 @@ on:
jobs:
auto-cc:
runs-on: ubuntu-latest
if: github.event_name == "issue" || github.event.pull_request.draft == false
if: github.event_name == 'issue' || github.event.pull_request.draft == false
steps:
- uses: carmocca/probot@v1
env:
22 changes: 11 additions & 11 deletions CHANGELOG.md
@@ -142,32 +142,32 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Fixed

- Fixed an issue where class or init-only variables of dataclasses were passed to the dataclass constructor in `utilities.apply_to_collection` ([#9702](https://github.com/PyTorchLightning/pytorch-lightning/issues/9702))

-

- Fixed `CombinedLoader` and `max_size_cycle` didn't receive a `DistributedSampler` ([#10374](https://github.com/PyTorchLightning/pytorch-lightning/issues/10374))

-

- Fixed scripting causing false positive deprecation warnings ([#10470](https://github.com/PyTorchLightning/pytorch-lightning/pull/10470), [#10555](https://github.com/PyTorchLightning/pytorch-lightning/pull/10555))

-

- Fixed `isinstance` not working with `init_meta_context`, materialized model not being moved to the device ([#10493](https://github.com/PyTorchLightning/metrics/pull/10493))


- Fixed an issue that prevented the Trainer to shutdown workers when execution is interrupted due to failure([#10463](https://github.com/PyTorchLightning/pytorch-lightning/issues/10463))
## [1.5.2] - 2021-11-16

### Fixed

- Fixed `CombinedLoader` and `max_size_cycle` didn't receive a `DistributedSampler` ([#10374](https://github.com/PyTorchLightning/pytorch-lightning/issues/10374))
- Fixed an issue where class or init-only variables of dataclasses were passed to the dataclass constructor in `utilities.apply_to_collection` ([#9702](https://github.com/PyTorchLightning/pytorch-lightning/issues/9702))
- Fixed `isinstance` not working with `init_meta_context`, materialized model not being moved to the device ([#10493](https://github.com/PyTorchLightning/metrics/pull/10493))
- Fixed an issue that prevented the Trainer to shutdown workers when execution is interrupted due to failure([#10463](https://github.com/PyTorchLightning/pytorch-lightning/issues/10463))
- Squeeze the early stopping monitor to remove empty tensor dimensions ([#10461](https://github.com/PyTorchLightning/pytorch-lightning/issues/10461))


- Fixed sampler replacement logic with `overfit_batches` to only replace the sample when `SequentialSampler` is not used ([#10486](https://github.com/PyTorchLightning/pytorch-lightning/issues/10486))


- Fixed scripting causing false positive deprecation warnings ([#10470](https://github.com/PyTorchLightning/pytorch-lightning/pull/10470), [#10555](https://github.com/PyTorchLightning/pytorch-lightning/pull/10555))
- Do not fail if batch size could not be inferred for logging when using DeepSpeed ([#10438](https://github.com/PyTorchLightning/pytorch-lightning/issues/10438))
- Fixed propagation of device and dtype information to submodules of LightningLite when they inherit from `DeviceDtypeModuleMixin` ([#10559](https://github.com/PyTorchLightning/pytorch-lightning/issues/10559))


-

## [1.5.1] - 2021-11-09

### Fixed
2 changes: 1 addition & 1 deletion benchmarks/test_basic_parity.py
@@ -160,8 +160,8 @@ def lightning_loop(cls_model, idx, device_type: str = "cuda", num_epochs=10):
max_epochs=num_epochs if idx > 0 else 1,
enable_progress_bar=False,
enable_model_summary=False,
enable_checkpointing=False,
gpus=1 if device_type == "cuda" else 0,
checkpoint_callback=False,
logger=False,
replace_sampler_ddp=False,
)
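The benchmark switches from the removed `checkpoint_callback` argument to the `enable_checkpointing` flag. A minimal sketch of a Trainer configured with the newer flags (illustrative only, not the benchmark itself):

```python
from pytorch_lightning import Trainer

# Sketch: enable_checkpointing=False disables checkpointing, as checkpoint_callback=False used to.
trainer = Trainer(
    max_epochs=1,
    enable_progress_bar=False,
    enable_model_summary=False,
    enable_checkpointing=False,
    logger=False,
)
```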
4 changes: 1 addition & 3 deletions pl_examples/basic_examples/mnist_datamodule.py
@@ -81,7 +81,6 @@ def __init__(
)
num_workers = 0

self.dims = (1, 28, 28)
self.data_dir = data_dir
self.val_split = val_split
self.num_workers = num_workers
@@ -90,7 +89,6 @@ def __init__(
self.batch_size = batch_size
self.dataset_train = ...
self.dataset_val = ...
self.test_transforms = self.default_transforms

@property
def num_classes(self):
@@ -134,7 +132,7 @@ def val_dataloader(self):

def test_dataloader(self):
"""MNIST test set uses the test split."""
extra = dict(transform=self.test_transforms) if self.test_transforms else {}
extra = dict(transform=self.default_transforms) if self.default_transforms else {}
dataset = MNIST(self.data_dir, train=False, download=False, **extra)
loader = DataLoader(
dataset,
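With the `test_transforms` attribute gone, the test dataloader reads `default_transforms` directly. A rough sketch of what such a property can look like (assumes torchvision; the normalization constants are illustrative, not taken from the example):

```python
from torchvision import transforms

class MNISTTransformsSketch:
    @property
    def default_transforms(self):
        # Tensor conversion plus an illustrative MNIST-style normalization.
        return transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize(mean=(0.1307,), std=(0.3081,))]
        )
```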
4 changes: 0 additions & 4 deletions pytorch_lightning/__init__.py
@@ -1,7 +1,6 @@
"""Root package info."""

import logging
import os

from pytorch_lightning.__about__ import * # noqa: F401, F403

@@ -14,9 +13,6 @@
_logger.addHandler(logging.StreamHandler())
_logger.propagate = False

_PACKAGE_ROOT = os.path.dirname(__file__)
_PROJECT_ROOT = os.path.dirname(_PACKAGE_ROOT)

from pytorch_lightning.callbacks import Callback # noqa: E402
from pytorch_lightning.core import LightningDataModule, LightningModule # noqa: E402
from pytorch_lightning.trainer import Trainer # noqa: E402
5 changes: 5 additions & 0 deletions pytorch_lightning/plugins/environments/slurm_environment.py
@@ -28,6 +28,11 @@ class SLURMEnvironment(ClusterEnvironment):
def creates_processes_externally(self) -> bool:
return True

@staticmethod
def detect() -> bool:
"""Returns ``True`` if the current process was launched on a SLURM cluster."""
return "SLURM_NTASKS" in os.environ

@property
def main_address(self) -> str:
# figure out the root node addr
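The new static `detect()` only checks for the `SLURM_NTASKS` variable that SLURM exports, so callers can cheaply test for an allocation without constructing the environment. A small illustration (not a test from this commit):

```python
import os
from pytorch_lightning.plugins.environments import SLURMEnvironment

os.environ.pop("SLURM_NTASKS", None)      # outside a SLURM allocation the variable is unset
assert SLURMEnvironment.detect() is False

os.environ["SLURM_NTASKS"] = "4"          # inside an allocation SLURM exports the task count
assert SLURMEnvironment.detect() is True
```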
58 changes: 25 additions & 33 deletions pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -134,7 +132,7 @@ def __init__(
self.precision = precision
self.amp_type = amp_type.lower() if isinstance(amp_type, str) else None
self.amp_level = amp_level
self._is_slurm_managing_tasks = False

self._precision_plugin: Optional[PrecisionPlugin] = None
self._training_type_plugin: Optional[TrainingTypePlugin] = None
@@ -167,7 +166,6 @@ def __init__(
self.handle_given_plugins()
self._set_distrib_type_if_training_type_plugin_passed()

self._configure_slurm_ddp()
self._cluster_environment = self.select_cluster_environment()

self.update_device_type_if_ipu_plugin()
@@ -703,15 +701,15 @@ def select_training_type_plugin(self) -> TrainingTypePlugin:
cluster_environment=self.select_cluster_environment(), parallel_devices=self.parallel_devices
)
elif self.use_ddp:
use_slurm_ddp = self.use_ddp and self._is_slurm_managing_tasks
use_slurm_ddp = self.use_ddp and self._is_slurm_managing_tasks()
use_torchelastic_ddp = self.use_ddp and TorchElasticEnvironment.is_using_torchelastic()
use_kubeflow_ddp = self.use_ddp and KubeflowEnvironment.is_using_kubeflow()
use_ddp_spawn = self._distrib_type == _StrategyType.DDP_SPAWN
use_ddp_cpu_spawn = use_ddp_spawn and self.use_cpu
use_tpu_spawn = self.use_tpu and self._distrib_type == _StrategyType.TPU_SPAWN
use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and TorchElasticEnvironment.is_using_torchelastic()
use_ddp_cpu_kubeflow = use_ddp_cpu_spawn and KubeflowEnvironment.is_using_kubeflow()
use_ddp_cpu_slurm = use_ddp_cpu_spawn and self._is_slurm_managing_tasks
use_ddp_cpu_slurm = use_ddp_cpu_spawn and self._is_slurm_managing_tasks()
use_ddp_sharded = self._distrib_type == _StrategyType.DDP_SHARDED
use_ddp_sharded_spawn = self._distrib_type == _StrategyType.DDP_SHARDED_SPAWN
use_ddp_fully_sharded = self._distrib_type == _StrategyType.DDP_FULLY_SHARDED
@@ -807,8 +805,9 @@ def select_accelerator(self) -> Accelerator:
def select_cluster_environment(self) -> ClusterEnvironment:
if self._cluster_environment is not None:
return self._cluster_environment
if self._is_slurm_managing_tasks:
if self._is_slurm_managing_tasks():
env = SLURMEnvironment()
rank_zero_info("Multiprocessing is handled by SLURM.")
elif TorchElasticEnvironment.is_using_torchelastic():
env = TorchElasticEnvironment()
elif KubeflowEnvironment.is_using_kubeflow():
@@ -990,34 +989,6 @@ def update_device_type_if_training_type_plugin_passed(self) -> None:
elif self.has_gpu:
self._device_type = DeviceType.GPU

def _configure_slurm_ddp(self):
# extract SLURM flag vars
# whenever we have the correct number of tasks, we let slurm manage processes
# otherwise we launch the required number of processes
if self.use_ddp or self.use_ddp2:
num_requested_gpus = self.num_gpus * self.num_nodes
num_slurm_tasks = 0
try:
num_slurm_tasks = int(os.environ["SLURM_NTASKS"])
self._is_slurm_managing_tasks = num_slurm_tasks == num_requested_gpus

# enable slurm cpu
if num_requested_gpus == 0:
self._is_slurm_managing_tasks = num_slurm_tasks == self.num_processes

# in interactive mode we don't manage tasks
job_name = os.environ["SLURM_JOB_NAME"]
if job_name == "bash":
self._is_slurm_managing_tasks = False

except Exception:
# likely not on slurm, so set the slurm managed flag to false
self._is_slurm_managing_tasks = False

# notify user the that slurm is managing tasks
if self._is_slurm_managing_tasks:
rank_zero_info("Multi-processing is handled by Slurm.")

def _set_distrib_type_if_training_type_plugin_passed(self):
# This is required as when `TrainingTypePlugin` instance is passed to either `strategy`
# or `plugins` flag, `AcceleratorConnector.set_distributed_mode` is not required to be
@@ -1026,3 +997,24 @@ def _set_distrib_type_if_training_type_plugin_passed(self):
return
if self._training_type_plugin is not None:
self._distrib_type = getattr(self._training_type_plugin, "distributed_backend", None)

def _is_slurm_managing_tasks(self) -> bool:
"""Returns whether we let SLURM manage the processes or not.
Returns ``True`` if and only if these conditions match:
- A SLURM cluster is detected
- A distributed plugin is being used
- The process is not launching in interactive mode
- The number of tasks in SLURM matches the requested number of devices and nodes in the Trainer
"""
if (
(not self.use_ddp and not self.use_ddp2)
or not SLURMEnvironment.detect()
or os.environ.get("SLURM_JOB_NAME") == "bash" # in interactive mode we don't manage tasks
):
return False

total_requested_devices = (self.num_gpus or self.num_processes) * self.num_nodes
num_slurm_tasks = int(os.environ["SLURM_NTASKS"], 0)
return num_slurm_tasks == total_requested_devices
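The eager `_configure_slurm_ddp()` pass is replaced by the lazy `_is_slurm_managing_tasks()` method above. Its core rule, that the SLURM task count must equal the devices requested from the Trainer, can be sketched as a standalone helper (logic only; not the connector's real API):

```python
import os

def slurm_manages_tasks(num_gpus: int, num_processes: int, num_nodes: int) -> bool:
    """Sketch of the decision made by _is_slurm_managing_tasks (assumes a ddp-style strategy)."""
    if "SLURM_NTASKS" not in os.environ:            # no SLURM cluster detected
        return False
    if os.environ.get("SLURM_JOB_NAME") == "bash":  # interactive job: don't let SLURM manage tasks
        return False
    total_requested_devices = (num_gpus or num_processes) * num_nodes
    return int(os.environ["SLURM_NTASKS"]) == total_requested_devices

# e.g. 2 GPUs per node on 2 nodes requires SLURM_NTASKS == 4
```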
7 changes: 0 additions & 7 deletions pytorch_lightning/utilities/enums.py
@@ -103,13 +103,6 @@ def supported_types() -> List[str]:
class DistributedType(LightningEnum, metaclass=_OnAccessEnumMeta):
"""Define type of training strategy.
>>> # you can match the type with string
>>> DistributedType.DDP == 'ddp'
True
>>> # which is case invariant
>>> DistributedType.DDP2 in ('ddp2', )
True
Deprecated since v1.6.0 and will be removed in v1.8.0.
Use `_StrategyType` instead.
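The removed doctest only demonstrated string comparison on the now-deprecated enum; the behaviour itself is unchanged, and the removal is likely tied to the stricter deprecation filter added in setup.cfg below. The same check as a plain sketch (accessing the deprecated enum emits a deprecation warning on 1.6-dev):

```python
from pytorch_lightning.utilities.enums import DistributedType

# LightningEnum values compare against plain strings, case-invariantly.
assert DistributedType.DDP == "ddp"
assert DistributedType.DDP2 in ("ddp2",)
```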
9 changes: 8 additions & 1 deletion setup.cfg
@@ -23,10 +23,17 @@ python_files =
test_*.py
# doctest_plus = disabled
addopts =
--strict
--strict-markers
--doctest-modules
--color=yes
--disable-pytest-warnings
filterwarnings =
# error out on our deprecation warnings - ensures the code and tests are kept up-to-date
error::pytorch_lightning.utilities.warnings.LightningDeprecationWarning
# warnings from deprecated modules on import
# TODO: remove in 1.7
ignore::pytorch_lightning.utilities.warnings.LightningDeprecationWarning:pytorch_lightning.core.decorators
ignore::pytorch_lightning.utilities.warnings.LightningDeprecationWarning:pytorch_lightning.core.memory

junit_duration_report = call

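The new `filterwarnings` rules make any `LightningDeprecationWarning` raised during the test suite fail the test, except when it comes from importing the two modules that are themselves deprecated. In plain Python, the `error::` rule is roughly equivalent to this (illustrative only):

```python
import warnings
from pytorch_lightning.utilities.warnings import LightningDeprecationWarning

warnings.simplefilter("error", LightningDeprecationWarning)  # escalate the warning to an exception

try:
    warnings.warn("this API is deprecated", LightningDeprecationWarning)
except LightningDeprecationWarning as exc:
    print(f"deprecation escalated to an error: {exc}")
```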
14 changes: 7 additions & 7 deletions tests/accelerators/test_accelerator_connector.py
@@ -103,7 +103,7 @@ def test_accelerator_choice_ddp_spawn(cuda_available_mock, device_count_mock):
def test_accelerator_choice_ddp_slurm(set_device_mock, device_count_mock, setup_distributed_mock):
class CB(Callback):
def on_fit_start(self, trainer, pl_module):
assert trainer._accelerator_connector._is_slurm_managing_tasks
assert trainer._accelerator_connector._is_slurm_managing_tasks()
assert isinstance(trainer.accelerator, GPUAccelerator)
assert isinstance(trainer.training_type_plugin, DDPPlugin)
assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment)
@@ -136,7 +136,7 @@ def on_fit_start(self, trainer, pl_module):
def test_accelerator_choice_ddp2_slurm(set_device_mock, device_count_mock, setup_distributed_mock):
class CB(Callback):
def on_fit_start(self, trainer, pl_module):
assert trainer._accelerator_connector._is_slurm_managing_tasks
assert trainer._accelerator_connector._is_slurm_managing_tasks()
assert isinstance(trainer.accelerator, GPUAccelerator)
assert isinstance(trainer.training_type_plugin, DDP2Plugin)
assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment)
@@ -323,7 +323,7 @@ def on_fit_start(self, trainer, pl_module):
def test_accelerator_choice_ddp_cpu_slurm(device_count_mock, setup_distributed_mock):
class CB(Callback):
def on_fit_start(self, trainer, pl_module):
assert trainer._accelerator_connector._is_slurm_managing_tasks
assert trainer._accelerator_connector._is_slurm_managing_tasks()
assert isinstance(trainer.accelerator, CPUAccelerator)
assert isinstance(trainer.training_type_plugin, DDPPlugin)
assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment)
@@ -343,7 +343,7 @@ def test_accelerator_choice_ddp_cpu_and_strategy(tmpdir):
_test_accelerator_choice_ddp_cpu_and_strategy(tmpdir, ddp_strategy_class=DDPPlugin)


@RunIf(skip_windows=True)
@RunIf(skip_windows=True, skip_49370=True)
def test_accelerator_choice_ddp_cpu_and_strategy_spawn(tmpdir):
"""Test that accelerator="ddp_cpu" can work together with an instance of DDPPSpawnPlugin."""
_test_accelerator_choice_ddp_cpu_and_strategy(tmpdir, ddp_strategy_class=DDPSpawnPlugin)
@@ -791,7 +791,7 @@ def test_strategy_choice_ddp_spawn(cuda_available_mock, device_count_mock):
def test_strategy_choice_ddp_slurm(setup_distributed_mock, strategy):
class CB(Callback):
def on_fit_start(self, trainer, pl_module):
assert trainer._accelerator_connector._is_slurm_managing_tasks
assert trainer._accelerator_connector._is_slurm_managing_tasks()
assert isinstance(trainer.accelerator, GPUAccelerator)
assert isinstance(trainer.training_type_plugin, DDPPlugin)
assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment)
@@ -824,7 +824,7 @@ def test_strategy_choice_ddp2_slurm(set_device_mock, device_count_mock, setup_distributed_mock, strategy):
def test_strategy_choice_ddp2_slurm(set_device_mock, device_count_mock, setup_distributed_mock, strategy):
class CB(Callback):
def on_fit_start(self, trainer, pl_module):
assert trainer._accelerator_connector._is_slurm_managing_tasks
assert trainer._accelerator_connector._is_slurm_managing_tasks()
assert isinstance(trainer.accelerator, GPUAccelerator)
assert isinstance(trainer.training_type_plugin, DDP2Plugin)
assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment)
@@ -1008,7 +1008,7 @@ def on_fit_start(self, trainer, pl_module):
def test_strategy_choice_ddp_cpu_slurm(device_count_mock, setup_distributed_mock, strategy):
class CB(Callback):
def on_fit_start(self, trainer, pl_module):
assert trainer._accelerator_connector._is_slurm_managing_tasks
assert trainer._accelerator_connector._is_slurm_managing_tasks()
assert isinstance(trainer.accelerator, CPUAccelerator)
assert isinstance(trainer.training_type_plugin, DDPPlugin)
assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment)
2 changes: 1 addition & 1 deletion tests/accelerators/test_cpu.py
@@ -35,7 +35,7 @@ def setup_optimizers_in_pre_dispatch(self) -> bool:
return delay_dispatch

model = TestModel()
trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, plugins=CustomPlugin(device=torch.device("cpu")))
trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, strategy=CustomPlugin(device=torch.device("cpu")))
trainer.fit(model)


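The custom training type plugin is now passed through `strategy=` rather than the `plugins=` argument. A minimal sketch of the same idea with a stock plugin (illustrative; `SingleDevicePlugin` stands in for the test's custom class):

```python
import torch
from pytorch_lightning import Trainer
from pytorch_lightning.plugins import SingleDevicePlugin

# Sketch: a training type plugin instance is handed to the Trainer via `strategy` (Lightning 1.5+ style).
trainer = Trainer(fast_dev_run=True, strategy=SingleDevicePlugin(device=torch.device("cpu")))
```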
12 changes: 2 additions & 10 deletions tests/accelerators/test_ddp.py
@@ -109,16 +109,8 @@ def setup(self, stage: Optional[str] = None) -> None:


@RunIf(min_gpus=2, min_torch="1.8.1", special=True)
def test_ddp_wrapper_16(tmpdir):
_test_ddp_wrapper(tmpdir, precision=16)


@RunIf(min_gpus=2, min_torch="1.8.1", special=True)
def test_ddp_wrapper_32(tmpdir):
_test_ddp_wrapper(tmpdir, precision=32)


def _test_ddp_wrapper(tmpdir, precision):
@pytest.mark.parametrize("precision", (16, 32))
def test_ddp_wrapper(tmpdir, precision):
"""Test parameters to ignore are carried over for DDP."""

class WeirdModule(torch.nn.Module):
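The two copies of the DDP wrapper test are folded into a single parametrized test. The pattern in isolation (a generic sketch, not the Lightning test itself):

```python
import pytest

@pytest.mark.parametrize("precision", (16, 32))
def test_runs_for_each_precision(precision):
    # One test body is collected twice, once per parameter, replacing two duplicated tests.
    assert precision in (16, 32)
```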
(Diff truncated: the remaining changed files of the 41 are not shown here.)
