
Commit e1ce887

Fix load_from_checkpoint to return model on correct device (Lightning-AI#17308)

Authored by ryan597
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com>

1 parent 84eb82a · commit e1ce887

File tree

4 files changed: +69 −6 lines changed

src/lightning/pytorch/CHANGELOG.md

Lines changed: 2 additions & 1 deletion

@@ -49,7 +49,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
-
+- Fixed issue where `Model.load_from_checkpoint("checkpoint.ckpt", map_location=map_location)` would always return model on CPU ([#17308](https://github.com/Lightning-AI/lightning/pull/17308))
+
 
 
 ## [2.0.1.post0] - 2023-04-11
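
In practice, the fixed behavior looks like the sketch below. This is illustration only, not part of the commit; "checkpoint.ckpt" is a hypothetical path, and a CUDA device is assumed to be available:

import torch
from lightning.pytorch.demos.boring_classes import BoringModel

# Before this fix, the loaded model always came back on CPU regardless of
# map_location; after it, the requested device is honored.
model = BoringModel.load_from_checkpoint("checkpoint.ckpt", map_location=torch.device("cuda"))
print(model.device)  # cuda:0 with this fix; previously cpu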

src/lightning/pytorch/core/saving.py

Lines changed: 9 additions & 4 deletions

@@ -21,7 +21,7 @@
 from copy import deepcopy
 from enum import Enum
 from pathlib import Path
-from typing import Any, Callable, cast, Dict, IO, Optional, Type, Union
+from typing import Any, Callable, Dict, IO, Optional, Type, Union
 from warnings import warn
 
 import yaml

@@ -56,8 +56,6 @@ def _load_from_checkpoint(
     strict: Optional[bool] = None,
     **kwargs: Any,
 ) -> Union["pl.LightningModule", "pl.LightningDataModule"]:
-    if map_location is None:
-        map_location = cast(_MAP_LOCATION_TYPE, lambda storage, loc: storage)
     with pl_legacy_patch():
         checkpoint = pl_load(checkpoint_path, map_location=map_location)
 

@@ -87,7 +85,14 @@ def _load_from_checkpoint(
     if issubclass(cls, pl.LightningDataModule):
         return _load_state(cls, checkpoint, **kwargs)
     if issubclass(cls, pl.LightningModule):
-        return _load_state(cls, checkpoint, strict=strict, **kwargs)
+        storage = _load_state(cls, checkpoint, strict=strict, **kwargs)
+        state_dict = checkpoint["state_dict"]
+        if not state_dict:
+            raise ValueError(f"The state dict in {checkpoint_path!r} contains no parameters.")
+        map_location = list(state_dict.values())[0].device
+        assert isinstance(storage, pl.LightningModule)
+        return storage.to(map_location)
+
     raise NotImplementedError(f"Unsupported {cls}")
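
For illustration only, a standalone sketch of the device-inference step the new code performs, using toy tensors rather than the library's API:

import torch

# A toy state dict standing in for checkpoint["state_dict"].
state_dict = {"layer.weight": torch.randn(2, 2), "layer.bias": torch.randn(2)}
if not state_dict:
    raise ValueError("The state dict contains no parameters.")

# The target device is read off the first tensor, exactly as in the diff above;
# all parameters in a checkpoint are assumed to share one device.
device = list(state_dict.values())[0].device
print(device)  # cpu for these toy tensors; cuda:0 for a GPU-saved checkpoint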

Lines changed: 57 additions & 0 deletions

@@ -0,0 +1,57 @@
+import pytest
+import torch
+
+import lightning.pytorch as pl
+from lightning.pytorch.callbacks import ModelCheckpoint
+from lightning.pytorch.demos.boring_classes import BoringModel
+from tests_pytorch.helpers.runif import RunIf
+
+
+def create_boring_checkpoint(tmp_path, model, accelerator="cuda"):
+    checkpoint_callback = ModelCheckpoint(dirpath=tmp_path, filename="checkpoint")
+    trainer = pl.Trainer(
+        devices=1,
+        accelerator=accelerator,
+        max_epochs=1,
+        enable_model_summary=False,
+        enable_progress_bar=False,
+        callbacks=[checkpoint_callback],
+    )
+    trainer.fit(model)
+
+
+@pytest.mark.parametrize(
+    "map_location", (None, "cpu", torch.device("cpu"), lambda storage, loc: storage, {"cpu": "cpu"})
+)
+def test_load_from_checkpoint_map_location_cpu(tmp_path, map_location):
+    create_boring_checkpoint(tmp_path, BoringModel(), accelerator="cpu")
+    model = BoringModel.load_from_checkpoint(f"{tmp_path}/checkpoint.ckpt", map_location=map_location)
+    assert model.device.type == "cpu"
+
+
+@RunIf(min_cuda_gpus=1)
+@pytest.mark.parametrize(
+    "map_location", (None, "cuda", torch.device("cuda"), lambda storage, loc: storage.cuda(), {"cpu": "cuda"})
+)
+def test_load_from_checkpoint_map_location_gpu(tmp_path, map_location):
+    create_boring_checkpoint(tmp_path, BoringModel(), accelerator="cuda")
+    model = BoringModel.load_from_checkpoint(f"{tmp_path}/checkpoint.ckpt", map_location=map_location)
+    assert model.device.type == "cuda"
+
+
+@RunIf(min_cuda_gpus=1)
+@pytest.mark.parametrize("map_location", ("cpu", torch.device("cpu"), lambda storage, loc: storage, {"cuda": "cpu"}))
+def test_load_from_checkpoint_map_location_gpu_to_cpu(tmp_path, map_location):
+    create_boring_checkpoint(tmp_path, BoringModel(), accelerator="cuda")
+    model = BoringModel.load_from_checkpoint(f"{tmp_path}/checkpoint.ckpt", map_location=map_location)
+    assert model.device.type == "cpu"
+
+
+@RunIf(min_cuda_gpus=1)
+@pytest.mark.parametrize(
+    "map_location", ("cuda", torch.device("cuda"), lambda storage, loc: storage.cuda(), {"cpu": "cuda"})
+)
+def test_load_from_checkpoint_map_location_cpu_to_gpu(tmp_path, map_location):
+    create_boring_checkpoint(tmp_path, BoringModel(), accelerator="cpu")
+    model = BoringModel.load_from_checkpoint(f"{tmp_path}/checkpoint.ckpt", map_location=map_location)
+    assert model.device.type == "cuda"
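
The parametrizations above exercise every map_location form that torch.load accepts: None, a device string, a torch.device, a callable, and a remapping dict. A quick standalone check of those forms (hypothetical scratch file name, toy tensor):

import torch

torch.save(torch.randn(2), "tensor.pt")  # hypothetical scratch file
for map_location in ("cpu", torch.device("cpu"), lambda storage, loc: storage, {"cpu": "cpu"}):
    loaded = torch.load("tensor.pt", map_location=map_location)
    assert loaded.device.type == "cpu"  # every form resolves to CPU here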

tests/tests_pytorch/strategies/test_fsdp.py

Lines changed: 1 addition & 1 deletion

@@ -150,7 +150,7 @@ def _assert_save_equality(trainer, ckpt_path, cls=TestFSDPModel):
 
     # Assert model parameters are identical after loading
     for ddp_param, shard_param in zip(model_state_dict.values(), saved_model.state_dict().values()):
-        assert torch.equal(ddp_param.float().cpu(), shard_param)
+        assert torch.equal(ddp_param, shard_param)
 
 
 @RunIf(min_torch="1.12")
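
The dropped .float().cpu() round-trip was only needed while loaded models always came back on CPU. A brief sketch, with toy tensors, of the invariant the simplified assertion relies on:

import torch

a = torch.randn(3)
b = a.clone()
# torch.equal requires matching dtype, device, shape, and values; with the fix
# above, loaded parameters already match the saved ones without any casting.
assert torch.equal(a, b)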
