From 5e0e02b79e07b1c7f76d75d2fcb669dbc13e958e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sat, 27 Apr 2024 07:24:07 +0200 Subject: [PATCH] Remove support for PyTorch 1.13 (#19706) --- .github/checkgroup.yml | 46 ++-- .github/workflows/ci-tests-fabric.yml | 24 +- .github/workflows/ci-tests-pytorch.yml | 24 +- .github/workflows/docker-build.yml | 7 +- dockers/release/Dockerfile | 6 +- requirements/fabric/base.txt | 2 +- requirements/fabric/examples.txt | 2 +- requirements/fabric/strategies.txt | 2 +- requirements/pytorch/base.txt | 2 +- requirements/pytorch/examples.txt | 2 +- requirements/pytorch/strategies.txt | 2 +- src/lightning/fabric/CHANGELOG.md | 2 +- src/lightning/fabric/accelerators/cuda.py | 212 +----------------- src/lightning/fabric/fabric.py | 31 +-- src/lightning/fabric/plugins/precision/amp.py | 5 +- .../fabric/plugins/precision/fsdp.py | 10 +- src/lightning/fabric/strategies/fsdp.py | 65 ++---- src/lightning/fabric/strategies/strategy.py | 7 +- src/lightning/fabric/strategies/xla_fsdp.py | 11 - src/lightning/fabric/utilities/imports.py | 3 +- src/lightning/fabric/utilities/load.py | 7 +- .../fabric/utilities/testing/_runif.py | 3 +- src/lightning/fabric/utilities/types.py | 50 +---- src/lightning/fabric/wrappers.py | 27 +-- src/lightning/pytorch/CHANGELOG.md | 2 +- .../callbacks/stochastic_weight_avg.py | 2 +- src/lightning/pytorch/cli.py | 10 +- src/lightning/pytorch/core/hooks.py | 4 +- src/lightning/pytorch/core/module.py | 7 +- src/lightning/pytorch/core/optimizer.py | 3 +- src/lightning/pytorch/demos/boring_classes.py | 4 +- .../pytorch/plugins/precision/amp.py | 5 +- .../pytorch/plugins/precision/fsdp.py | 10 +- src/lightning/pytorch/strategies/deepspeed.py | 3 +- src/lightning/pytorch/strategies/fsdp.py | 43 +--- src/lightning/pytorch/strategies/strategy.py | 6 +- .../connectors/logger_connector/result.py | 6 +- src/lightning/pytorch/trainer/trainer.py | 15 +- src/lightning/pytorch/tuner/lr_finder.py | 21 +- src/lightning/pytorch/utilities/compile.py | 22 +- .../utilities/model_summary/model_summary.py | 6 +- .../pytorch/utilities/testing/_runif.py | 3 +- src/lightning/pytorch/utilities/types.py | 11 +- tests/tests_fabric/accelerators/test_cuda.py | 14 -- .../plugins/precision/test_fsdp.py | 22 +- tests/tests_fabric/strategies/test_ddp.py | 3 +- .../strategies/test_ddp_integration.py | 9 +- tests/tests_fabric/strategies/test_fsdp.py | 29 --- .../strategies/test_fsdp_integration.py | 29 +-- .../tests_fabric/strategies/test_strategy.py | 7 +- .../tests_fabric/strategies/test_xla_fsdp.py | 6 +- .../strategies/test_xla_fsdp_integration.py | 6 +- tests/tests_fabric/test_fabric.py | 18 +- tests/tests_fabric/test_wrappers.py | 8 +- tests/tests_fabric/utilities/test_load.py | 7 - .../core/test_lightning_module.py | 4 - tests/tests_pytorch/models/test_hooks.py | 13 +- .../plugins/precision/test_fsdp.py | 22 +- tests/tests_pytorch/strategies/test_common.py | 4 +- tests/tests_pytorch/strategies/test_ddp.py | 3 +- .../strategies/test_ddp_integration.py | 5 +- tests/tests_pytorch/strategies/test_fsdp.py | 123 +++------- .../optimization/test_manual_optimization.py | 7 +- tests/tests_pytorch/trainer/test_trainer.py | 9 - tests/tests_pytorch/utilities/test_compile.py | 4 + .../utilities/test_model_summary.py | 5 - 66 files changed, 221 insertions(+), 871 deletions(-) diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 7a89f64b14181..37f1e3cd844d2 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -19,21 +19,21 @@ 
subprojects: - "!*.md" - "!**/*.md" checks: - - "pl-cpu (macOS-11, lightning, 3.8, 1.13, oldest)" - - "pl-cpu (macOS-11, lightning, 3.10, 1.13)" + - "pl-cpu (macOS-11, lightning, 3.8, 2.0, oldest)" + - "pl-cpu (macOS-11, lightning, 3.10, 2.0)" - "pl-cpu (macOS-11, lightning, 3.10, 2.1)" - "pl-cpu (macOS-11, lightning, 3.10, 2.2)" - - "pl-cpu (ubuntu-20.04, lightning, 3.8, 1.13, oldest)" - - "pl-cpu (ubuntu-20.04, lightning, 3.10, 1.13)" + - "pl-cpu (ubuntu-20.04, lightning, 3.8, 2.0, oldest)" + - "pl-cpu (ubuntu-20.04, lightning, 3.10, 2.0)" - "pl-cpu (ubuntu-20.04, lightning, 3.10, 2.1)" - "pl-cpu (ubuntu-20.04, lightning, 3.10, 2.2)" - - "pl-cpu (windows-2022, lightning, 3.8, 1.13, oldest)" - - "pl-cpu (windows-2022, lightning, 3.10, 1.13)" + - "pl-cpu (windows-2022, lightning, 3.8, 2.0, oldest)" + - "pl-cpu (windows-2022, lightning, 3.10, 2.0)" - "pl-cpu (windows-2022, lightning, 3.10, 2.1)" - "pl-cpu (windows-2022, lightning, 3.10, 2.2)" - - "pl-cpu (macOS-11, pytorch, 3.8, 1.13)" - - "pl-cpu (ubuntu-20.04, pytorch, 3.8, 1.13)" - - "pl-cpu (windows-2022, pytorch, 3.8, 1.13)" + - "pl-cpu (macOS-11, pytorch, 3.8, 2.0)" + - "pl-cpu (ubuntu-20.04, pytorch, 3.8, 2.0)" + - "pl-cpu (windows-2022, pytorch, 3.8, 2.0)" - "pl-cpu (macOS-12, pytorch, 3.11, 2.0)" - "pl-cpu (macOS-12, pytorch, 3.11, 2.1)" - "pl-cpu (ubuntu-22.04, pytorch, 3.11, 2.0)" @@ -140,15 +140,17 @@ subprojects: - "!*.md" - "!**/*.md" checks: - - "build-cuda (3.9, 1.13, 11.8.0)" - - "build-cuda (3.9, 1.13, 12.0.1)" - "build-cuda (3.10, 2.0, 11.8.0)" - "build-cuda (3.10, 2.1, 12.1.0)" + - "build-cuda (3.10, 2.2, 12.1.0)" + - "build-cuda (3.11, 2.1, 12.1.0)" + - "build-cuda (3.11, 2.2, 12.1.0)" #- "build-NGC" - - "build-pl (3.9, 1.13, 11.8.0)" - - "build-pl (3.9, 1.13, 12.0.1)" - "build-pl (3.10, 2.0, 11.8.0)" - "build-pl (3.10, 2.1, 12.1.0)" + - "build-pl (3.10, 2.2, 12.1.0)" + - "build-pl (3.11, 2.1, 12.1.0)" + - "build-pl (3.11, 2.2, 12.1.0)" # SECTION: lightning_fabric @@ -165,21 +167,21 @@ subprojects: - "!*.md" - "!**/*.md" checks: - - "fabric-cpu (macOS-11, lightning, 3.8, 1.13, oldest)" - - "fabric-cpu (macOS-11, lightning, 3.10, 1.13)" + - "fabric-cpu (macOS-11, lightning, 3.8, 2.0, oldest)" + - "fabric-cpu (macOS-11, lightning, 3.10, 2.0)" - "fabric-cpu (macOS-11, lightning, 3.11, 2.1)" - "fabric-cpu (macOS-11, lightning, 3.11, 2.2)" - - "fabric-cpu (ubuntu-20.04, lightning, 3.8, 1.13, oldest)" - - "fabric-cpu (ubuntu-20.04, lightning, 3.10, 1.13)" + - "fabric-cpu (ubuntu-20.04, lightning, 3.8, 2.0, oldest)" + - "fabric-cpu (ubuntu-20.04, lightning, 3.10, 2.0)" - "fabric-cpu (ubuntu-20.04, lightning, 3.11, 2.1)" - "fabric-cpu (ubuntu-20.04, lightning, 3.11, 2.2)" - - "fabric-cpu (windows-2022, lightning, 3.8, 1.13, oldest)" - - "fabric-cpu (windows-2022, lightning, 3.10, 1.13)" + - "fabric-cpu (windows-2022, lightning, 3.8, 2.0, oldest)" + - "fabric-cpu (windows-2022, lightning, 3.10, 2.0)" - "fabric-cpu (windows-2022, lightning, 3.11, 2.1)" - "fabric-cpu (windows-2022, lightning, 3.11, 2.2)" - - "fabric-cpu (macOS-11, fabric, 3.8, 1.13)" - - "fabric-cpu (ubuntu-20.04, fabric, 3.8, 1.13)" - - "fabric-cpu (windows-2022, fabric, 3.8, 1.13)" + - "fabric-cpu (macOS-11, fabric, 3.8, 2.0)" + - "fabric-cpu (ubuntu-20.04, fabric, 3.8, 2.0)" + - "fabric-cpu (windows-2022, fabric, 3.8, 2.0)" - "fabric-cpu (macOS-12, fabric, 3.11, 2.0)" - "fabric-cpu (macOS-12, fabric, 3.11, 2.1)" - "fabric-cpu (ubuntu-22.04, fabric, 3.11, 2.0)" diff --git a/.github/workflows/ci-tests-fabric.yml b/.github/workflows/ci-tests-fabric.yml index 
ccb6fc928a014..61c60889a5aa0 100644 --- a/.github/workflows/ci-tests-fabric.yml +++ b/.github/workflows/ci-tests-fabric.yml @@ -39,9 +39,9 @@ jobs: fail-fast: false matrix: include: - - { os: "macOS-11", pkg-name: "lightning", python-version: "3.10", pytorch-version: "1.13" } - - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "1.13" } - - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "1.13" } + - { os: "macOS-11", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" } + - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" } + - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" } # only run PyTorch latest - { os: "macOS-11", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.1" } - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.1" } @@ -57,31 +57,25 @@ jobs: - { os: "ubuntu-22.04", pkg-name: "fabric", python-version: "3.11", pytorch-version: "2.1" } - { os: "windows-2022", pkg-name: "fabric", python-version: "3.11", pytorch-version: "2.1" } # "oldest" versions tests, only on minimum Python - - { - os: "macOS-11", - pkg-name: "lightning", - python-version: "3.8", - pytorch-version: "1.13", - requires: "oldest", - } + - { os: "macOS-11", pkg-name: "lightning", python-version: "3.8", pytorch-version: "2.0", requires: "oldest" } - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.8", - pytorch-version: "1.13", + pytorch-version: "2.0", requires: "oldest", } - { os: "windows-2022", pkg-name: "lightning", python-version: "3.8", - pytorch-version: "1.13", + pytorch-version: "2.0", requires: "oldest", } # "fabric" installs the standalone package - - { os: "macOS-11", pkg-name: "fabric", python-version: "3.8", pytorch-version: "1.13" } - - { os: "ubuntu-20.04", pkg-name: "fabric", python-version: "3.8", pytorch-version: "1.13" } - - { os: "windows-2022", pkg-name: "fabric", python-version: "3.8", pytorch-version: "1.13" } + - { os: "macOS-11", pkg-name: "fabric", python-version: "3.8", pytorch-version: "2.0" } + - { os: "ubuntu-20.04", pkg-name: "fabric", python-version: "3.8", pytorch-version: "2.0" } + - { os: "windows-2022", pkg-name: "fabric", python-version: "3.8", pytorch-version: "2.0" } timeout-minutes: 25 # because of building grpcio on Mac env: PACKAGE_NAME: ${{ matrix.pkg-name }} diff --git a/.github/workflows/ci-tests-pytorch.yml b/.github/workflows/ci-tests-pytorch.yml index 1e5835054fdfe..b7f5b14baf255 100644 --- a/.github/workflows/ci-tests-pytorch.yml +++ b/.github/workflows/ci-tests-pytorch.yml @@ -43,9 +43,9 @@ jobs: fail-fast: false matrix: include: - - { os: "macOS-11", pkg-name: "lightning", python-version: "3.10", pytorch-version: "1.13" } - - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "1.13" } - - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "1.13" } + - { os: "macOS-11", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" } + - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" } + - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" } # only run PyTorch latest - { os: "macOS-11", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" } - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" } @@ 
-61,31 +61,25 @@ jobs: - { os: "ubuntu-22.04", pkg-name: "pytorch", python-version: "3.11", pytorch-version: "2.1" } - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.11", pytorch-version: "2.1" } # "oldest" versions tests, only on minimum Python - - { - os: "macOS-11", - pkg-name: "lightning", - python-version: "3.8", - pytorch-version: "1.13", - requires: "oldest", - } + - { os: "macOS-11", pkg-name: "lightning", python-version: "3.8", pytorch-version: "2.0", requires: "oldest" } - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.8", - pytorch-version: "1.13", + pytorch-version: "2.0", requires: "oldest", } - { os: "windows-2022", pkg-name: "lightning", python-version: "3.8", - pytorch-version: "1.13", + pytorch-version: "2.0", requires: "oldest", } # "pytorch" installs the standalone package - - { os: "macOS-11", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "1.13" } - - { os: "ubuntu-20.04", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "1.13" } - - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "1.13" } + - { os: "macOS-11", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "2.0" } + - { os: "ubuntu-20.04", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "2.0" } + - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "2.0" } timeout-minutes: 50 env: PACKAGE_NAME: ${{ matrix.pkg-name }} diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 8005d3386ba5e..7ea9f824bb6b1 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -43,10 +43,11 @@ jobs: include: # We only release one docker image per PyTorch version. # Make sure the matrix here matches the one below. - - { python_version: "3.9", pytorch_version: "1.13", cuda_version: "11.8.0" } - - { python_version: "3.9", pytorch_version: "1.13", cuda_version: "12.0.1" } - { python_version: "3.10", pytorch_version: "2.0", cuda_version: "11.8.0" } - { python_version: "3.10", pytorch_version: "2.1", cuda_version: "12.1.0" } + - { python_version: "3.10", pytorch_version: "2.2", cuda_version: "12.1.0" } + - { python_version: "3.11", pytorch_version: "2.1", cuda_version: "12.1.0" } + - { python_version: "3.11", pytorch_version: "2.2", cuda_version: "12.1.0" } steps: - uses: actions/checkout@v4 with: @@ -103,8 +104,6 @@ jobs: include: # These are the base images for PL release docker images. # Make sure the matrix here matches the one above. - - { python_version: "3.9", pytorch_version: "1.13", cuda_version: "11.8.0" } - - { python_version: "3.9", pytorch_version: "1.13", cuda_version: "12.0.1" } - { python_version: "3.10", pytorch_version: "2.0", cuda_version: "11.8.0" } - { python_version: "3.10", pytorch_version: "2.1", cuda_version: "12.1.0" } - { python_version: "3.10", pytorch_version: "2.2", cuda_version: "12.1.0" } diff --git a/dockers/release/Dockerfile b/dockers/release/Dockerfile index 5001929b14798..6f8b884857b65 100644 --- a/dockers/release/Dockerfile +++ b/dockers/release/Dockerfile @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-ARG PYTHON_VERSION=3.9 -ARG PYTORCH_VERSION=1.13 -ARG CUDA_VERSION=11.3.1 +ARG PYTHON_VERSION=3.10 +ARG PYTORCH_VERSION=2.0 +ARG CUDA_VERSION=11.8.0 FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION}-cuda${CUDA_VERSION} diff --git a/requirements/fabric/base.txt b/requirements/fabric/base.txt index c57d24a49e583..3a6cdbacd302f 100644 --- a/requirements/fabric/base.txt +++ b/requirements/fabric/base.txt @@ -2,7 +2,7 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment numpy >=1.17.2, <1.27.0 -torch >=1.13.0, <2.3.0 +torch >=2.0.0, <2.3.0 fsspec[http] >=2022.5.0, <2023.11.0 packaging >=20.0, <=23.1 typing-extensions >=4.4.0, <4.10.0 diff --git a/requirements/fabric/examples.txt b/requirements/fabric/examples.txt index e077065766b76..d0be7e3af8496 100644 --- a/requirements/fabric/examples.txt +++ b/requirements/fabric/examples.txt @@ -1,6 +1,6 @@ # NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment -torchvision >=0.14.0, <0.18.0 +torchvision >=0.15.0, <0.18.0 torchmetrics >=0.10.0, <1.3.0 lightning-utilities >=0.8.0, <0.12.0 diff --git a/requirements/fabric/strategies.txt b/requirements/fabric/strategies.txt index 6c302f21269e3..4aee89d9f68e7 100644 --- a/requirements/fabric/strategies.txt +++ b/requirements/fabric/strategies.txt @@ -5,5 +5,5 @@ # note: is a bug around 0.10 with `MPS_Accelerator must implement all abstract methods` # shall be resolved by https://github.com/microsoft/DeepSpeed/issues/4372 -deepspeed >=0.8.2, <=0.9.3; platform_system != "Windows" # strict +deepspeed >=0.8.2, <=0.9.3; platform_system != "Windows" and platform_system != "Darwin" # strict bitsandbytes >=0.42.0,<0.43.0 diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index ed4250bb3832b..3578917e2cdf0 100644 --- a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -2,7 +2,7 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment numpy >=1.17.2, <1.27.0 -torch >=1.13.0, <2.3.0 +torch >=2.0.0, <2.3.0 tqdm >=4.57.0, <4.67.0 PyYAML >=5.4, <6.1.0 fsspec[http] >=2022.5.0, <2023.11.0 diff --git a/requirements/pytorch/examples.txt b/requirements/pytorch/examples.txt index 56b7971eb61b0..716b033def533 100644 --- a/requirements/pytorch/examples.txt +++ b/requirements/pytorch/examples.txt @@ -2,7 +2,7 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment requests <2.32.0 -torchvision >=0.14.0, <0.18.0 +torchvision >=0.15.0, <0.18.0 gym[classic_control] >=0.17.0, <0.27.0 ipython[all] <8.15.0 torchmetrics >=0.10.0, <1.3.0 diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt index 751ca213d3b53..8d3af408a98fe 100644 --- a/requirements/pytorch/strategies.txt +++ b/requirements/pytorch/strategies.txt @@ -3,4 +3,4 @@ # note: is a bug around 0.10 with `MPS_Accelerator must implement all abstract methods` # shall be resolved by https://github.com/microsoft/DeepSpeed/issues/4372 -deepspeed >=0.8.2, <=0.9.3; platform_system != "Windows" # strict +deepspeed >=0.8.2, <=0.9.3; platform_system != "Windows" and platform_system != "Darwin" # strict diff --git a/src/lightning/fabric/CHANGELOG.md 
b/src/lightning/fabric/CHANGELOG.md index d53529d391418..154433a1c101d 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -39,7 +39,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Removed -- +- Removed support for PyTorch 1.13 ([#19706](https://github.com/Lightning-AI/lightning/pull/19706)) - diff --git a/src/lightning/fabric/accelerators/cuda.py b/src/lightning/fabric/accelerators/cuda.py index 8613c6549e4c9..4afc9be723fc2 100644 --- a/src/lightning/fabric/accelerators/cuda.py +++ b/src/lightning/fabric/accelerators/cuda.py @@ -11,18 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import os -import warnings -from contextlib import contextmanager from functools import lru_cache -from typing import Generator, List, Optional, Union, cast +from typing import List, Optional, Union import torch from typing_extensions import override from lightning.fabric.accelerators.accelerator import Accelerator from lightning.fabric.accelerators.registry import _AcceleratorRegistry -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.fabric.utilities.rank_zero import rank_zero_info @@ -144,211 +140,15 @@ def _get_all_visible_cuda_devices() -> List[int]: return list(range(num_cuda_devices())) -# TODO: Remove once minimum supported PyTorch version is 2.0 -@contextmanager -def _patch_cuda_is_available() -> Generator: - """Context manager that safely patches :func:`torch.cuda.is_available` with its NVML-based version if possible.""" - if hasattr(torch._C, "_cuda_getDeviceCount") and _device_count_nvml() >= 0 and not _TORCH_GREATER_EQUAL_2_0: - # we can safely patch is_available if both torch has CUDA compiled and the NVML count is succeeding - # otherwise, patching is_available could lead to attribute errors or infinite recursion - orig_check = torch.cuda.is_available - torch.cuda.is_available = is_cuda_available - try: - yield - finally: - torch.cuda.is_available = orig_check - else: - yield - - -@lru_cache(1) def num_cuda_devices() -> int: - """Returns the number of available CUDA devices. - - Unlike :func:`torch.cuda.device_count`, this function does its best not to create a CUDA context for fork support, - if the platform allows it. - - """ - if _TORCH_GREATER_EQUAL_2_0: - return torch.cuda.device_count() - - # Implementation copied from upstream: https://github.com/pytorch/pytorch/pull/84879 - # TODO: Remove once minimum supported PyTorch version is 2.0 - nvml_count = _device_count_nvml() - return torch.cuda.device_count() if nvml_count < 0 else nvml_count + """Returns the number of available CUDA devices.""" + return torch.cuda.device_count() def is_cuda_available() -> bool: - """Returns a bool indicating if CUDA is currently available. - - Unlike :func:`torch.cuda.is_available`, this function does its best not to create a CUDA context for fork support, - if the platform allows it. 
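The accelerator helpers now defer to torch directly. Below is a minimal sketch of the behaviour this relies on; the helper name is purely illustrative, and the NVML environment variable is the one lightning.fabric.__init__ already exports.

import os

os.environ.setdefault("PYTORCH_NVML_BASED_CUDA_CHECK", "1")  # set by lightning.fabric.__init__ in the real package

import torch


def fork_safe_cuda_check() -> bool:
    # With torch >= 2.0, both calls can answer via NVML without creating a CUDA
    # context, which keeps them safe in processes that later fork workers.
    return torch.cuda.is_available() and torch.cuda.device_count() > 0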
- - """ + """Returns a bool indicating if CUDA is currently available.""" # We set `PYTORCH_NVML_BASED_CUDA_CHECK=1` in lightning.fabric.__init__.py - return torch.cuda.is_available() if _TORCH_GREATER_EQUAL_2_0 else num_cuda_devices() > 0 - - -# TODO: Remove once minimum supported PyTorch version is 2.0 -def _parse_visible_devices() -> Union[List[int], List[str]]: - """Parse CUDA_VISIBLE_DEVICES environment variable.""" - var = os.getenv("CUDA_VISIBLE_DEVICES") - if var is None: - return list(range(64)) - - def _strtoul(s: str) -> int: - """Return -1 or positive integer sequence string starts with,""" - if not s: - return -1 - for idx, c in enumerate(s): - if not (c.isdigit() or (idx == 0 and c in "+-")): - break - if idx + 1 == len(s): - idx += 1 - return int(s[:idx]) if idx > 0 else -1 - - def parse_list_with_prefix(lst: str, prefix: str) -> List[str]: - rcs: List[str] = [] - for elem in lst.split(","): - # Repeated id results in empty set - if elem in rcs: - return cast(List[str], []) - # Anything other but prefix is ignored - if not elem.startswith(prefix): - break - rcs.append(elem) - return rcs - - if var.startswith("GPU-"): - return parse_list_with_prefix(var, "GPU-") - if var.startswith("MIG-"): - return parse_list_with_prefix(var, "MIG-") - # CUDA_VISIBLE_DEVICES uses something like strtoul - # which makes `1gpu2,2ampere` is equivalent to `1,2` - rc: List[int] = [] - for elem in var.split(","): - x = _strtoul(elem.strip()) - # Repeated ordinal results in empty set - if x in rc: - return cast(List[int], []) - # Negative value aborts the sequence - if x < 0: - break - rc.append(x) - return rc - - -# TODO: Remove once minimum supported PyTorch version is 2.0 -def _raw_device_count_nvml() -> int: - """Return number of devices as reported by NVML or negative value if NVML discovery/initialization failed.""" - from ctypes import CDLL, byref, c_int - - nvml_h = CDLL("libnvidia-ml.so.1") - rc = nvml_h.nvmlInit() - if rc != 0: - warnings.warn("Can't initialize NVML") - return -1 - dev_count = c_int(-1) - rc = nvml_h.nvmlDeviceGetCount_v2(byref(dev_count)) - if rc != 0: - warnings.warn("Can't get nvml device count") - return -1 - del nvml_h - return dev_count.value - - -# TODO: Remove once minimum supported PyTorch version is 2.0 -def _raw_device_uuid_nvml() -> Optional[List[str]]: - """Return list of device UUID as reported by NVML or None if NVM discovery/initialization failed.""" - from ctypes import CDLL, byref, c_int, c_void_p, create_string_buffer - - nvml_h = CDLL("libnvidia-ml.so.1") - rc = nvml_h.nvmlInit() - if rc != 0: - warnings.warn("Can't initialize NVML") - return None - dev_count = c_int(-1) - rc = nvml_h.nvmlDeviceGetCount_v2(byref(dev_count)) - if rc != 0: - warnings.warn("Can't get nvml device count") - return None - uuids: List[str] = [] - for idx in range(dev_count.value): - dev_id = c_void_p() - rc = nvml_h.nvmlDeviceGetHandleByIndex_v2(idx, byref(dev_id)) - if rc != 0: - warnings.warn("Can't get device handle") - return None - buf_len = 96 - buf = create_string_buffer(buf_len) - rc = nvml_h.nvmlDeviceGetUUID(dev_id, buf, buf_len) - if rc != 0: - warnings.warn("Can't get device UUID") - return None - uuids.append(buf.raw.decode("ascii").strip("\0")) - del nvml_h - return uuids - - -# TODO: Remove once minimum supported PyTorch version is 2.0 -def _transform_uuid_to_ordinals(candidates: List[str], uuids: List[str]) -> List[int]: - """Given the set of partial uuids and list of known uuids builds a set of ordinals excluding ambiguous partials - IDs.""" - - def 
uuid_to_orinal(candidate: str, uuids: List[str]) -> int: - best_match = -1 - for idx, uuid in enumerate(uuids): - if not uuid.startswith(candidate): - continue - # Ambigous candidate - if best_match != -1: - return -1 - best_match = idx - return best_match - - rc: List[int] = [] - for candidate in candidates: - idx = uuid_to_orinal(candidate, uuids) - # First invalid ordinal stops parsing - if idx < 0: - break - # Duplicates result in empty set - if idx in rc: - return cast(List[int], []) - rc.append(idx) - return rc - - -# TODO: Remove once minimum supported PyTorch version is 2.0 -def _device_count_nvml() -> int: - """Return number of devices as reported by NVML taking CUDA_VISIBLE_DEVICES into account. - - Negative value is returned if NVML discovery or initialization has failed. - - """ - visible_devices = _parse_visible_devices() - if not visible_devices: - return 0 - try: - if isinstance(visible_devices[0], str): - # Skip MIG parsing - if visible_devices[0].startswith("MIG-"): - return -1 - uuids = _raw_device_uuid_nvml() - if uuids is None: - return -1 - visible_devices = _transform_uuid_to_ordinals(cast(List[str], visible_devices), uuids) - else: - raw_cnt = _raw_device_count_nvml() - if raw_cnt <= 0: - return raw_cnt - # Trim the list up to a maximum available device - for idx, val in enumerate(visible_devices): - if cast(int, val) >= raw_cnt: - return idx - except (OSError, AttributeError): - return -1 - return len(visible_devices) + return torch.cuda.is_available() def _is_ampere_or_later(device: Optional[torch.device] = None) -> bool: @@ -375,7 +175,7 @@ def _check_cuda_matmul_precision(device: torch.device) -> None: def _clear_cuda_memory() -> None: # strangely, the attribute function be undefined when torch.compile is used - if _TORCH_GREATER_EQUAL_2_0 and hasattr(torch._C, "_cuda_clearCublasWorkspaces"): + if hasattr(torch._C, "_cuda_clearCublasWorkspaces"): # https://github.com/pytorch/pytorch/issues/95668 torch._C._cuda_clearCublasWorkspaces() torch.cuda.empty_cache() diff --git a/src/lightning/fabric/fabric.py b/src/lightning/fabric/fabric.py index 4b9c14eb06e62..aa67d2e7ce9ac 100644 --- a/src/lightning/fabric/fabric.py +++ b/src/lightning/fabric/fabric.py @@ -51,7 +51,6 @@ FSDPStrategy, SingleDeviceStrategy, Strategy, - XLAFSDPStrategy, XLAStrategy, ) from lightning.fabric.strategies.fsdp import _has_meta_device_parameters @@ -67,7 +66,6 @@ ) from lightning.fabric.utilities.device_dtype_mixin import _update_properties from lightning.fabric.utilities.distributed import DistributedSamplerWrapper, _InfiniteBarrier -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.fabric.utilities.rank_zero import rank_zero_deprecation, rank_zero_warn from lightning.fabric.utilities.registry import _load_external_callbacks from lightning.fabric.utilities.seed import seed_everything @@ -699,26 +697,14 @@ def sharded_model(self) -> ContextManager: def init_tensor(self) -> ContextManager: """Tensors that you instantiate under this context manager will be created on the device right away and have - the right data type depending on the precision setting in Fabric. - - The automatic device placement under this context manager is only supported with PyTorch 2.0 and newer. - - """ - if not _TORCH_GREATER_EQUAL_2_0 and self.device.type != "cpu": - rank_zero_warn( - "`Fabric.init_tensor()` can't place tensors on the device directly" - " with PyTorch < 2.0. Parameters will remain on CPU until `Fabric.setup()` is called." 
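With the torch < 2.0 warning removed, Fabric.init_module() and Fabric.init_tensor() always materialize tensors on the target device with the configured precision. A short usage sketch, assuming a single CUDA device; the model is a stand-in:

import torch

from lightning.fabric import Fabric

fabric = Fabric(accelerator="cuda", devices=1, precision="bf16-true")
fabric.launch()

with fabric.init_module():
    # Parameters are created on the CUDA device in bfloat16 right away instead of
    # being allocated on CPU and moved later in `setup()`.
    model = torch.nn.Linear(128, 2)

optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
model, optimizer = fabric.setup(model, optimizer)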
- " Upgrade to PyTorch >= 2.0 to fully utilize this feature.", - category=PossibleUserWarning, - ) + the right data type depending on the precision setting in Fabric.""" return self._strategy.tensor_init_context() def init_module(self, empty_init: Optional[bool] = None) -> ContextManager: """Instantiate the model and its parameters under this context manager to reduce peak memory usage. The parameters get created on the device and with the right data type right away without wasting memory being - allocated unnecessarily. The automatic device placement under this context manager is only supported with - PyTorch 2.0 and newer. + allocated unnecessarily. Args: empty_init: Whether to initialize the model with empty weights (uninitialized memory). @@ -727,13 +713,6 @@ def init_module(self, empty_init: Optional[bool] = None) -> ContextManager: """ self._validate_launched() - if not _TORCH_GREATER_EQUAL_2_0 and self.device.type != "cpu": - rank_zero_warn( - "`Fabric.init_module()` can't place the model parameters on the device directly" - " with PyTorch < 2.0. Parameters will remain on CPU until `Fabric.setup()` is called." - " Upgrade to PyTorch >= 2.0 to fully utilize this feature.", - category=PossibleUserWarning, - ) return self._strategy.module_init_context(empty_init=empty_init) def save( @@ -1036,12 +1015,6 @@ def _validate_setup(self, module: nn.Module, optimizers: Sequence[Optimizer]) -> if any(isinstance(opt, _FabricOptimizer) for opt in optimizers): raise ValueError("An optimizer should be passed only once to the `setup` method.") - if isinstance(self._strategy, (FSDPStrategy, XLAFSDPStrategy)) and not _TORCH_GREATER_EQUAL_2_0: - raise RuntimeError( - f"The `{type(self).__name__}` requires the model and optimizer(s) to be set up separately." - " Create and set up the model first through `model = self.setup_module(model)`. Then create the" - " optimizer and set it up: `optimizer = self.setup_optimizer(optimizer)`." 
- ) if isinstance(self._strategy, FSDPStrategy) and any( _has_meta_device_parameters(optimizer) for optimizer in optimizers ): diff --git a/src/lightning/fabric/plugins/precision/amp.py b/src/lightning/fabric/plugins/precision/amp.py index 0ec21247c9881..75d7932ddb916 100644 --- a/src/lightning/fabric/plugins/precision/amp.py +++ b/src/lightning/fabric/plugins/precision/amp.py @@ -20,7 +20,6 @@ from torch.optim import LBFGS, Optimizer from typing_extensions import override -from lightning.fabric.accelerators.cuda import _patch_cuda_is_available from lightning.fabric.plugins.precision.precision import Precision from lightning.fabric.plugins.precision.utils import _convert_fp_tensor from lightning.fabric.utilities.types import Optimizable @@ -50,9 +49,7 @@ def __init__( self.precision = precision if scaler is None and self.precision == "16-mixed": - with _patch_cuda_is_available(): - # if possible, we defer CUDA initialization to support strategies that will attempt forks - scaler = torch.cuda.amp.GradScaler() + scaler = torch.cuda.amp.GradScaler() if scaler is not None and self.precision == "bf16-mixed": raise ValueError(f"`precision='bf16-mixed'` does not use a scaler, found {scaler}.") self.device = device diff --git a/src/lightning/fabric/plugins/precision/fsdp.py b/src/lightning/fabric/plugins/precision/fsdp.py index 161ad98f43475..179fc21cdd90d 100644 --- a/src/lightning/fabric/plugins/precision/fsdp.py +++ b/src/lightning/fabric/plugins/precision/fsdp.py @@ -23,7 +23,6 @@ from lightning.fabric.plugins.precision.amp import _optimizer_handles_unscaling from lightning.fabric.plugins.precision.precision import Precision from lightning.fabric.plugins.precision.utils import _convert_fp_tensor, _DtypeContextManager -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.fabric.utilities.types import Optimizable if TYPE_CHECKING: @@ -78,21 +77,18 @@ def __init__(self, precision: _PRECISION_INPUT, scaler: Optional["ShardedGradSca def mixed_precision_config(self) -> "TorchMixedPrecision": from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision as TorchMixedPrecision - # With PyTorch < 2.0, FSDP uses the noneness of `param_dtype` as a proxy for the `_uses_param_mixed_precision` - # property. In order to avoid FSDP assertion failures, we therefore avoid setting `param_dtype` to - # `torch.float32` here with PyTorch < 2.0. 
if self.precision == "16-mixed": - param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32 + param_dtype = torch.float32 reduce_dtype = buffer_dtype = torch.float16 elif self.precision == "bf16-mixed": - param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32 + param_dtype = torch.float32 reduce_dtype = buffer_dtype = torch.bfloat16 elif self.precision == "16-true": param_dtype = reduce_dtype = buffer_dtype = torch.float16 elif self.precision == "bf16-true": param_dtype = reduce_dtype = buffer_dtype = torch.bfloat16 elif self.precision == "32-true": - param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32 + param_dtype = torch.float32 reduce_dtype = buffer_dtype = torch.float32 else: raise ValueError(f"Was unable to infer precision type, received {self.precision!r}.") diff --git a/src/lightning/fabric/strategies/fsdp.py b/src/lightning/fabric/strategies/fsdp.py index ed89629f720e8..30251a9315cd4 100644 --- a/src/lightning/fabric/strategies/fsdp.py +++ b/src/lightning/fabric/strategies/fsdp.py @@ -63,7 +63,6 @@ ) from lightning.fabric.utilities.distributed import group as _group from lightning.fabric.utilities.imports import ( - _TORCH_GREATER_EQUAL_2_0, _TORCH_GREATER_EQUAL_2_1, _TORCH_GREATER_EQUAL_2_2, _TORCH_GREATER_EQUAL_2_3, @@ -76,14 +75,9 @@ if TYPE_CHECKING: from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload, MixedPrecision, ShardingStrategy + from torch.distributed.fsdp.wrap import ModuleWrapPolicy - if _TORCH_GREATER_EQUAL_2_0: - from torch.distributed.fsdp.wrap import ModuleWrapPolicy - - _POLICY = Union[Set[Type[Module]], Callable[[Module, bool, int], bool], ModuleWrapPolicy] - else: - _POLICY = Union[Set[Type[Module]], Callable[[Module, bool, int], bool]] # type: ignore[misc] - + _POLICY = Union[Set[Type[Module]], Callable[[Module, bool, int], bool], ModuleWrapPolicy] _SHARDING_STRATEGY = Union[ShardingStrategy, Literal["FULL_SHARD", "SHARD_GRAD_OP", "NO_SHARD", "HYBRID_SHARD"]] _FSDP_ALIASES = ("fsdp", "fsdp_cpu_offload") @@ -168,9 +162,8 @@ def __init__( self._backward_sync_control = _FSDPBackwardSyncControl() self._fsdp_kwargs = _auto_wrap_policy_kwargs(auto_wrap_policy, kwargs) - if _TORCH_GREATER_EQUAL_2_0: - # Enables joint setup of model and optimizer, multiple optimizer param groups, and `torch.compile()` - self._fsdp_kwargs.setdefault("use_orig_params", True) + # Enables joint setup of model and optimizer, multiple optimizer param groups, and `torch.compile()` + self._fsdp_kwargs.setdefault("use_orig_params", True) self._activation_checkpointing_kwargs = _activation_checkpointing_kwargs( activation_checkpointing, activation_checkpointing_policy @@ -259,12 +252,6 @@ def setup_module_and_optimizers( ) -> Tuple[Module, List[Optimizer]]: """Wraps the model into a :class:`~torch.distributed.fsdp.fully_sharded_data_parallel.FullyShardedDataParallel` module and sets `use_orig_params=True` to keep the reference to the original parameters in the optimizer.""" - if not _TORCH_GREATER_EQUAL_2_0: - raise NotImplementedError( - f"The `{type(self).__name__}` does not support the joint setup of module and optimizer(s)." - " Please do it in this order: Create the model, call `setup_module`, create the optimizer," - " call `setup_optimizer`." - ) use_orig_params = self._fsdp_kwargs.get("use_orig_params") if use_orig_params is False: raise ValueError( @@ -428,11 +415,6 @@ def save_checkpoint( creates a metadata file `meta.pt` with the rest of the user's state (only saved from rank 0). 
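Since the torch < 2.0 guards around FSDP checkpointing are gone, saving and loading through Fabric always follows the directory layout described in the docstring above. A brief sketch, assuming two CUDA devices; the path and state keys are illustrative:

import torch

from lightning.fabric import Fabric
from lightning.fabric.strategies import FSDPStrategy

fabric = Fabric(strategy=FSDPStrategy(state_dict_type="sharded"), accelerator="cuda", devices=2)
fabric.launch()

with fabric.init_module():
    model = torch.nn.Linear(128, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
model, optimizer = fabric.setup(model, optimizer)  # joint setup works because use_orig_params=True is the default

state = {"model": model, "optimizer": optimizer, "step": 0}
fabric.save("checkpoints/step-00000", state)  # one shard file per rank plus a meta.pt with the extra user state
fabric.load("checkpoints/step-00000", state)  # restores the shards in place and returns any remaining metadata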
""" - if not _TORCH_GREATER_EQUAL_2_0: - raise NotImplementedError( - "Saving and loading checkpoints with the `FSDPStrategy` is not supported in PyTorch < 2.0." - " Please upgrade `torch` or file an issue: `https://github.com/Lightning-AI/lightning/issues`." - ) if storage_options is not None: raise TypeError( "`FSDPStrategy.save_checkpoint(..., storage_options=...)` is not supported because" @@ -530,11 +512,6 @@ def load_checkpoint( directory of multiple files rather than a single file. """ - if not _TORCH_GREATER_EQUAL_2_0: - raise NotImplementedError( - "Saving and loading checkpoints with the `FSDPStrategy` is not supported in PyTorch < 2.0." - " Please upgrade `torch` or file an issue: `https://github.com/Lightning-AI/lightning/issues`." - ) if not state: raise ValueError( f"Got FSDPStrategy.load_checkpoint(..., state={state!r}) but a state with at least " @@ -614,16 +591,15 @@ def load_checkpoint( return metadata if _is_full_checkpoint(path): - checkpoint = _lazy_load(path) if _TORCH_GREATER_EQUAL_2_0 else torch.load(path, map_location="cpu") + checkpoint = _lazy_load(path) _load_raw_module_state(checkpoint.pop(module_key), module=module, world_size=self.world_size, strict=strict) if isinstance(state, Module): return {} - if _TORCH_GREATER_EQUAL_2_0: - # Materialize lazy tensors if there are any left in the checkpoint - # The `torch.Optimizer.load_state_dict` method can't load lazy tensors because of deepcopy pickle issues - checkpoint = _materialize_tensors(checkpoint) + # Materialize lazy tensors if there are any left in the checkpoint + # The `torch.Optimizer.load_state_dict` method can't load lazy tensors because of deepcopy pickle issues + checkpoint = _materialize_tensors(checkpoint) # Load optimizer states for optim_key, optim in optimizers.items(): @@ -840,27 +816,20 @@ def _get_full_state_dict_context( ) -> Generator[None, None, None]: from torch.distributed.fsdp import FullStateDictConfig, StateDictType from torch.distributed.fsdp import FullyShardedDataParallel as FSDP + from torch.distributed.fsdp.api import FullOptimStateDictConfig - # In PyTorch <= 2.0, offload to CPU in combination with `world_size=1` is not possible + # In PyTorch < 2.1, offload to CPU in combination with `world_size=1` is not possible offload_to_cpu = world_size > 1 or _TORCH_GREATER_EQUAL_2_1 state_dict_config = FullStateDictConfig(offload_to_cpu=offload_to_cpu, rank0_only=rank0_only) - if _TORCH_GREATER_EQUAL_2_0: - from torch.distributed.fsdp.api import FullOptimStateDictConfig + optim_state_dict_config = FullOptimStateDictConfig(offload_to_cpu=offload_to_cpu, rank0_only=rank0_only) + state_dict_type_context = FSDP.state_dict_type( + module=module, + state_dict_type=StateDictType.FULL_STATE_DICT, + state_dict_config=state_dict_config, + optim_state_dict_config=optim_state_dict_config, + ) - optim_state_dict_config = FullOptimStateDictConfig(offload_to_cpu=offload_to_cpu, rank0_only=rank0_only) - state_dict_type_context = FSDP.state_dict_type( - module=module, - state_dict_type=StateDictType.FULL_STATE_DICT, - state_dict_config=state_dict_config, - optim_state_dict_config=optim_state_dict_config, - ) - else: - state_dict_type_context = FSDP.state_dict_type( - module=module, - state_dict_type=StateDictType.FULL_STATE_DICT, - state_dict_config=state_dict_config, - ) return state_dict_type_context # type: ignore[return-value] diff --git a/src/lightning/fabric/strategies/strategy.py b/src/lightning/fabric/strategies/strategy.py index 1c64f97394fa2..6bfed6a270b68 100644 --- 
a/src/lightning/fabric/strategies/strategy.py +++ b/src/lightning/fabric/strategies/strategy.py @@ -29,7 +29,6 @@ from lightning.fabric.strategies.launchers.launcher import _Launcher from lightning.fabric.strategies.registry import _StrategyRegistry from lightning.fabric.utilities.apply_func import move_data_to_device -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.fabric.utilities.init import _EmptyInit from lightning.fabric.utilities.types import _PATH, Optimizable, ReduceOp, _Stateful @@ -122,8 +121,7 @@ def tensor_init_context(self) -> ContextManager: """Controls how tensors get created (device, dtype).""" precision_init_ctx = self.precision.tensor_init_context() stack = ExitStack() - if _TORCH_GREATER_EQUAL_2_0: - stack.enter_context(self.root_device) + stack.enter_context(self.root_device) stack.enter_context(precision_init_ctx) return stack @@ -140,8 +138,7 @@ def module_init_context(self, empty_init: Optional[bool] = None) -> ContextManag """ precision_module_ctx = self.precision.module_init_context() stack = ExitStack() - if _TORCH_GREATER_EQUAL_2_0: - stack.enter_context(self.root_device) + stack.enter_context(self.root_device) stack.enter_context(_EmptyInit(enabled=bool(empty_init))) stack.enter_context(precision_module_ctx) return stack diff --git a/src/lightning/fabric/strategies/xla_fsdp.py b/src/lightning/fabric/strategies/xla_fsdp.py index 1b53292ff1581..6da693bafb1c8 100644 --- a/src/lightning/fabric/strategies/xla_fsdp.py +++ b/src/lightning/fabric/strategies/xla_fsdp.py @@ -39,7 +39,6 @@ _validate_keys_for_strict_loading, ) from lightning.fabric.utilities.cloud_io import get_filesystem -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.fabric.utilities.init import _EmptyInit from lightning.fabric.utilities.rank_zero import rank_zero_only, rank_zero_warn from lightning.fabric.utilities.types import _PATH, Optimizable, ReduceOp @@ -420,11 +419,6 @@ def save_checkpoint( consolidated checkpoint combining all of the sharded checkpoints. """ - if not _TORCH_GREATER_EQUAL_2_0: - raise NotImplementedError( - "Saving and loading checkpoints with the `XLAFSDPStrategy` is not supported in PyTorch < 2.0." - " Please upgrade `torch`." - ) # broadcast the path from rank 0 to ensure all the states are saved in a common path path = Path(self.broadcast(path)) if path.is_dir() and any(path.iterdir()): @@ -527,11 +521,6 @@ def load_checkpoint( directory of multiple files rather than a single file. """ - if not _TORCH_GREATER_EQUAL_2_0: - raise NotImplementedError( - "Saving and loading checkpoints with the `FSDPStrategy` is not supported in PyTorch < 2.0." - " Please upgrade `torch` or file an issue: `https://github.com/Lightning-AI/lightning/issues`." - ) if not state: raise ValueError( f"Got `XLAFSDPStrategy.load_checkpoint(..., state={state!r})` but a state with at least " diff --git a/src/lightning/fabric/utilities/imports.py b/src/lightning/fabric/utilities/imports.py index cc069a2a73338..bcfeadf3298ca 100644 --- a/src/lightning/fabric/utilities/imports.py +++ b/src/lightning/fabric/utilities/imports.py @@ -26,11 +26,10 @@ # 2. 
The inspection mode via `python -i`: https://stackoverflow.com/a/6879085/1162383 _IS_INTERACTIVE = hasattr(sys, "ps1") or bool(sys.flags.interactive) -_TORCH_GREATER_EQUAL_2_0 = compare_version("torch", operator.ge, "2.0.0") _TORCH_GREATER_EQUAL_2_1 = compare_version("torch", operator.ge, "2.1.0") _TORCH_GREATER_EQUAL_2_2 = compare_version("torch", operator.ge, "2.2.0") _TORCH_GREATER_EQUAL_2_3 = compare_version("torch", operator.ge, "2.3.0", use_base_version=True) -_TORCH_EQUAL_2_0 = _TORCH_GREATER_EQUAL_2_0 and not _TORCH_GREATER_EQUAL_2_1 +_TORCH_EQUAL_2_0 = compare_version("torch", operator.ge, "2.0.0") and not _TORCH_GREATER_EQUAL_2_1 _PYTHON_GREATER_EQUAL_3_8_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 8) _PYTHON_GREATER_EQUAL_3_10_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 10) diff --git a/src/lightning/fabric/utilities/load.py b/src/lightning/fabric/utilities/load.py index 29ccca9e4375f..9862cc2bd981e 100644 --- a/src/lightning/fabric/utilities/load.py +++ b/src/lightning/fabric/utilities/load.py @@ -25,10 +25,7 @@ from torch.nn import Parameter from typing_extensions import override -from lightning.fabric.utilities.imports import ( - _TORCH_GREATER_EQUAL_2_0, - _TORCH_GREATER_EQUAL_2_3, -) +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3 from lightning.fabric.utilities.types import _PATH, _Stateful _METADATA_FILENAME = "meta.pt" @@ -202,8 +199,6 @@ def persistent_load(self, pid: tuple) -> "TypedStorage": def _lazy_load(filename: _PATH) -> Any: - if not _TORCH_GREATER_EQUAL_2_0: - raise NotImplementedError("Lazy-loading is only supported with PyTorch >= 2.0.") if not os.path.isfile(filename): raise FileNotFoundError(f"Path {str(filename)!r} does not exist or is not a file.") file_reader = torch.PyTorchFileReader(str(filename)) diff --git a/src/lightning/fabric/utilities/testing/_runif.py b/src/lightning/fabric/utilities/testing/_runif.py index b9bfd1e269d71..6ab2ff730eec9 100644 --- a/src/lightning/fabric/utilities/testing/_runif.py +++ b/src/lightning/fabric/utilities/testing/_runif.py @@ -24,7 +24,7 @@ from lightning.fabric.accelerators.cuda import num_cuda_devices from lightning.fabric.accelerators.mps import MPSAccelerator from lightning.fabric.strategies.deepspeed import _DEEPSPEED_AVAILABLE -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0, _TORCH_GREATER_EQUAL_2_1 +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_1 def _runif_reasons( @@ -122,7 +122,6 @@ def _runif_reasons( cond = not is_dynamo_supported() else: cond = sys.platform == "win32" or sys.version_info >= (3, 11) - cond |= not _TORCH_GREATER_EQUAL_2_0 if cond: reasons.append("torch.dynamo") diff --git a/src/lightning/fabric/utilities/types.py b/src/lightning/fabric/utilities/types.py index c4bc32f3cf319..2e18dc89b05b2 100644 --- a/src/lightning/fabric/utilities/types.py +++ b/src/lightning/fabric/utilities/types.py @@ -28,10 +28,10 @@ import torch from torch import Tensor -from torch.optim import Optimizer -from typing_extensions import TypeAlias, overload -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 +# TODO: Unused import, but lightning_habana imports these from here +from torch.optim.lr_scheduler import LRScheduler, ReduceLROnPlateau # noqa: F401 +from typing_extensions import TypeAlias, overload UntypedStorage: TypeAlias = torch.UntypedStorage @@ -42,7 +42,6 @@ ] _PARAMETERS = Iterator[torch.nn.Parameter] - if torch.distributed.is_available(): from torch.distributed import 
ProcessGroup, ReduceOp @@ -70,49 +69,6 @@ def size(self) -> int: ... def rank(self) -> int: ... -# Inferred from `torch.optim.lr_scheduler.pyi` -# Missing attributes were added to improve typing -@runtime_checkable -class LRScheduler(_Stateful[str], Protocol): - optimizer: Optimizer - base_lrs: List[float] - - def __init__(self, optimizer: Optimizer, *args: Any, **kwargs: Any) -> None: ... - - def step(self, epoch: Optional[int] = None) -> None: ... - - -_TORCH_LRSCHEDULER: TypeAlias = ( - torch.optim.lr_scheduler.LRScheduler # type: ignore[valid-type] - if _TORCH_GREATER_EQUAL_2_0 - else torch.optim.lr_scheduler._LRScheduler -) - - -# Inferred from `torch.optim.lr_scheduler.pyi` -# Missing attributes were added to improve typing -@runtime_checkable -class ReduceLROnPlateau(_Stateful[str], Protocol): - in_cooldown: bool - optimizer: Optimizer - - def __init__( - self, - optimizer: Optimizer, - mode: str = ..., - factor: float = ..., - patience: int = ..., - verbose: bool = ..., - threshold: float = ..., - threshold_mode: str = ..., - cooldown: int = ..., - min_lr: float = ..., - eps: float = ..., - ) -> None: ... - - def step(self, metrics: Union[float, int, Tensor], epoch: Optional[int] = None) -> None: ... - - @runtime_checkable class Steppable(Protocol): """To structurally type ``optimizer.step()``""" diff --git a/src/lightning/fabric/wrappers.py b/src/lightning/fabric/wrappers.py index 093b355e2c376..f932750e14239 100644 --- a/src/lightning/fabric/wrappers.py +++ b/src/lightning/fabric/wrappers.py @@ -15,7 +15,6 @@ from copy import deepcopy from functools import partial, wraps from typing import ( - TYPE_CHECKING, Any, Callable, Dict, @@ -35,6 +34,7 @@ from lightning_utilities.core.apply_func import apply_to_collection from torch import Tensor from torch import nn as nn +from torch._dynamo import OptimizedModule from torch.nn.modules.module import _IncompatibleKeys from torch.optim import Optimizer from torch.utils.data import DataLoader @@ -45,12 +45,8 @@ from lightning.fabric.utilities import move_data_to_device from lightning.fabric.utilities.data import _set_sampler_epoch from lightning.fabric.utilities.device_dtype_mixin import _DeviceDtypeModuleMixin -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.fabric.utilities.types import Optimizable -if TYPE_CHECKING: - from torch._dynamo import OptimizedModule - T_destination = TypeVar("T_destination", bound=Dict[str, Any]) _LIGHTNING_MODULE_STEP_METHODS = ("training_step", "validation_step", "test_step", "predict_step") @@ -329,26 +325,17 @@ def _unwrap( return obj types = [_FabricModule, _FabricOptimizer, _FabricDataLoader] - if _TORCH_GREATER_EQUAL_2_0: - from torch._dynamo import OptimizedModule - - types.append(OptimizedModule) + types.append(OptimizedModule) return apply_to_collection(collection, dtype=tuple(types), function=_unwrap) -def _unwrap_compiled(obj: Union[Any, "OptimizedModule"]) -> Tuple[Union[Any, nn.Module], Optional[Dict[str, Any]]]: +def _unwrap_compiled(obj: Union[Any, OptimizedModule]) -> Tuple[Union[Any, nn.Module], Optional[Dict[str, Any]]]: """Removes the :class:`torch._dynamo.OptimizedModule` around the object if it is wrapped. Use this function before instance checks against e.g. :class:`_FabricModule`. 
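With OptimizedModule importable unconditionally, the unwrap helpers treat torch.compile output like any other wrapper. A small sketch of the object shape involved; the linear layer is a stand-in:

import torch
from torch._dynamo import OptimizedModule

model = torch.nn.Linear(4, 4)
compiled = torch.compile(model)

assert isinstance(compiled, OptimizedModule)
# `_orig_mod` holds the original, uncompiled module that the helpers unwrap to.
assert compiled._orig_mod is model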
""" - if not _TORCH_GREATER_EQUAL_2_0: - # obj can't be an `OptimizedModule` anyway - return obj, None - - from torch._dynamo import OptimizedModule - if isinstance(obj, OptimizedModule): if (compile_kwargs := getattr(obj, "_compile_kwargs", None)) is None: raise RuntimeError( @@ -359,10 +346,7 @@ def _unwrap_compiled(obj: Union[Any, "OptimizedModule"]) -> Tuple[Union[Any, nn. return obj, None -def _to_compiled(module: nn.Module, compile_kwargs: Dict[str, Any]) -> "OptimizedModule": - if not _TORCH_GREATER_EQUAL_2_0: - raise RuntimeError("Converting to a compiled module is only supported in PyTorch >= 2.0.0") - +def _to_compiled(module: nn.Module, compile_kwargs: Dict[str, Any]) -> OptimizedModule: return torch.compile(module, **compile_kwargs) # type: ignore[return-value] @@ -414,5 +398,4 @@ def _capture(*args: Any, **kwargs: Any) -> Any: return _capture -if _TORCH_GREATER_EQUAL_2_0: - torch.compile = _capture_compile_kwargs(torch.compile) +torch.compile = _capture_compile_kwargs(torch.compile) diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index 3838a6258b052..e4ae5a29c336c 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -38,7 +38,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Removed the Bagua integration (`Trainer(strategy="bagua")`) ([#19445](https://github.com/Lightning-AI/lightning/pull/19445)) -- +- Removed support for PyTorch 1.13 ([#19706](https://github.com/Lightning-AI/lightning/pull/19706)) - diff --git a/src/lightning/pytorch/callbacks/stochastic_weight_avg.py b/src/lightning/pytorch/callbacks/stochastic_weight_avg.py index 731f161683102..c3d5cf4496fe5 100644 --- a/src/lightning/pytorch/callbacks/stochastic_weight_avg.py +++ b/src/lightning/pytorch/callbacks/stochastic_weight_avg.py @@ -21,11 +21,11 @@ import torch from torch import Tensor, nn +from torch.optim.lr_scheduler import LRScheduler from torch.optim.swa_utils import SWALR from typing_extensions import override import lightning.pytorch as pl -from lightning.fabric.utilities.types import LRScheduler from lightning.pytorch.callbacks.callback import Callback from lightning.pytorch.strategies import DeepSpeedStrategy from lightning.pytorch.strategies.fsdp import FSDPStrategy diff --git a/src/lightning/pytorch/cli.py b/src/lightning/pytorch/cli.py index a6854b9bf6d89..09f025b988089 100644 --- a/src/lightning/pytorch/cli.py +++ b/src/lightning/pytorch/cli.py @@ -23,11 +23,11 @@ from lightning_utilities.core.imports import RequirementCache from lightning_utilities.core.rank_zero import _warn from torch.optim import Optimizer +from torch.optim.lr_scheduler import LRScheduler from typing_extensions import override import lightning.pytorch as pl from lightning.fabric.utilities.cloud_io import get_filesystem -from lightning.fabric.utilities.types import _TORCH_LRSCHEDULER from lightning.pytorch import Callback, LightningDataModule, LightningModule, Trainer, seed_everything from lightning.pytorch.core.mixins.hparams_mixin import _given_hyperparameters_context from lightning.pytorch.utilities.exceptions import MisconfigurationException @@ -63,15 +63,15 @@ def __init__(self, optimizer: Optimizer, monitor: str, *args: Any, **kwargs: Any # LightningCLI requires the ReduceLROnPlateau defined here, thus it shouldn't accept the one from pytorch: -LRSchedulerTypeTuple = (_TORCH_LRSCHEDULER, ReduceLROnPlateau) -LRSchedulerTypeUnion = Union[_TORCH_LRSCHEDULER, ReduceLROnPlateau] -LRSchedulerType = 
Union[Type[_TORCH_LRSCHEDULER], Type[ReduceLROnPlateau]] +LRSchedulerTypeTuple = (LRScheduler, ReduceLROnPlateau) +LRSchedulerTypeUnion = Union[LRScheduler, ReduceLROnPlateau] +LRSchedulerType = Union[Type[LRScheduler], Type[ReduceLROnPlateau]] # Type aliases intended for convenience of CLI developers ArgsType = Optional[Union[List[str], Dict[str, Any], Namespace]] OptimizerCallable = Callable[[Iterable], Optimizer] -LRSchedulerCallable = Callable[[Optimizer], Union[_TORCH_LRSCHEDULER, ReduceLROnPlateau]] +LRSchedulerCallable = Callable[[Optimizer], Union[LRScheduler, ReduceLROnPlateau]] class LightningArgumentParser(ArgumentParser): diff --git a/src/lightning/pytorch/core/hooks.py b/src/lightning/pytorch/core/hooks.py index 4a4cad3d5f080..5495a0262036d 100644 --- a/src/lightning/pytorch/core/hooks.py +++ b/src/lightning/pytorch/core/hooks.py @@ -19,7 +19,6 @@ from torch import Tensor from torch.optim.optimizer import Optimizer -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.pytorch.utilities import move_data_to_device from lightning.pytorch.utilities.exceptions import MisconfigurationException from lightning.pytorch.utilities.types import EVAL_DATALOADERS, STEP_OUTPUT, TRAIN_DATALOADERS @@ -158,8 +157,7 @@ def on_predict_batch_end(self, outputs: Optional[Any], batch: Any, batch_idx: in def on_validation_model_zero_grad(self) -> None: """Called by the training loop to release gradients before entering the validation loop.""" - zero_grad_kwargs = {} if _TORCH_GREATER_EQUAL_2_0 else {"set_to_none": True} - self.zero_grad(**zero_grad_kwargs) + self.zero_grad() def on_validation_model_eval(self) -> None: """Called when the validation loop starts. diff --git a/src/lightning/pytorch/core/module.py b/src/lightning/pytorch/core/module.py index faeda00ce5aa9..3cb55566fb8b7 100644 --- a/src/lightning/pytorch/core/module.py +++ b/src/lightning/pytorch/core/module.py @@ -50,7 +50,7 @@ from lightning.fabric.utilities.apply_func import convert_to_tensors from lightning.fabric.utilities.cloud_io import get_filesystem from lightning.fabric.utilities.device_dtype_mixin import _DeviceDtypeModuleMixin -from lightning.fabric.utilities.imports import _IS_WINDOWS, _TORCH_GREATER_EQUAL_2_0, _TORCH_GREATER_EQUAL_2_1 +from lightning.fabric.utilities.imports import _IS_WINDOWS, _TORCH_GREATER_EQUAL_2_1 from lightning.fabric.utilities.types import _MAP_LOCATION_TYPE, _PATH from lightning.fabric.wrappers import _FabricOptimizer from lightning.pytorch.callbacks.callback import Callback @@ -217,9 +217,6 @@ def trainer(self, trainer: Optional["pl.Trainer"]) -> None: for v in self.children(): if isinstance(v, LightningModule): v.trainer = trainer # type: ignore[assignment] - # https://github.com/pytorch/pytorch/issues/95857 - if not _TORCH_GREATER_EQUAL_2_0 and trainer is not None and not isinstance(trainer, weakref.ProxyTypes): - trainer = weakref.proxy(trainer) self._trainer = trainer @property @@ -1377,7 +1374,7 @@ def forward(self, x): model.to_onnx("export.onnx", input_sample, export_params=True) """ - if _TORCH_GREATER_EQUAL_2_0 and not _ONNX_AVAILABLE: + if not _ONNX_AVAILABLE: raise ModuleNotFoundError( f"`torch>=2.0` requires `onnx` to be installed to use `{type(self).__name__}.to_onnx()`" ) diff --git a/src/lightning/pytorch/core/optimizer.py b/src/lightning/pytorch/core/optimizer.py index b7a63a8e17cab..777dca0b51dfe 100644 --- a/src/lightning/pytorch/core/optimizer.py +++ b/src/lightning/pytorch/core/optimizer.py @@ -19,10 +19,11 @@ import torch from torch import optim 
from torch.optim import Optimizer +from torch.optim.lr_scheduler import ReduceLROnPlateau from typing_extensions import override import lightning.pytorch as pl -from lightning.fabric.utilities.types import Optimizable, ReduceLROnPlateau, _Stateful +from lightning.fabric.utilities.types import Optimizable, _Stateful from lightning.pytorch.utilities.exceptions import MisconfigurationException from lightning.pytorch.utilities.model_helpers import is_overridden from lightning.pytorch.utilities.rank_zero import rank_zero_warn diff --git a/src/lightning/pytorch/demos/boring_classes.py b/src/lightning/pytorch/demos/boring_classes.py index 3dd7bd8b1afc8..fd2660228146e 100644 --- a/src/lightning/pytorch/demos/boring_classes.py +++ b/src/lightning/pytorch/demos/boring_classes.py @@ -18,9 +18,9 @@ import torch.nn.functional as F from torch import Tensor from torch.optim import Optimizer +from torch.optim.lr_scheduler import LRScheduler from torch.utils.data import DataLoader, Dataset, IterableDataset, Subset -from lightning.fabric.utilities.types import _TORCH_LRSCHEDULER from lightning.pytorch import LightningDataModule, LightningModule from lightning.pytorch.core.optimizer import LightningOptimizer from lightning.pytorch.utilities.types import STEP_OUTPUT @@ -134,7 +134,7 @@ def validation_step(self, batch: Any, batch_idx: int) -> STEP_OUTPUT: def test_step(self, batch: Any, batch_idx: int) -> STEP_OUTPUT: return {"y": self.step(batch)} - def configure_optimizers(self) -> Tuple[List[torch.optim.Optimizer], List[_TORCH_LRSCHEDULER]]: + def configure_optimizers(self) -> Tuple[List[torch.optim.Optimizer], List[LRScheduler]]: optimizer = torch.optim.SGD(self.parameters(), lr=0.1) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) return [optimizer], [lr_scheduler] diff --git a/src/lightning/pytorch/plugins/precision/amp.py b/src/lightning/pytorch/plugins/precision/amp.py index 70ce9a87fb37a..c0a309f070ef6 100644 --- a/src/lightning/pytorch/plugins/precision/amp.py +++ b/src/lightning/pytorch/plugins/precision/amp.py @@ -18,7 +18,6 @@ from typing_extensions import override import lightning.pytorch as pl -from lightning.fabric.accelerators.cuda import _patch_cuda_is_available from lightning.fabric.plugins.precision.amp import _optimizer_handles_unscaling from lightning.fabric.utilities.types import Optimizable from lightning.pytorch.plugins.precision.precision import Precision @@ -50,9 +49,7 @@ def __init__( self.precision = precision if scaler is None and self.precision == "16-mixed": - with _patch_cuda_is_available(): - # if possible, we defer CUDA initialization to support strategies that will attempt forks - scaler = torch.cuda.amp.GradScaler() + scaler = torch.cuda.amp.GradScaler() if scaler is not None and self.precision == "bf16-mixed": raise MisconfigurationException(f"`precision='bf16-mixed'` does not use a scaler, found {scaler}.") self.device = device diff --git a/src/lightning/pytorch/plugins/precision/fsdp.py b/src/lightning/pytorch/plugins/precision/fsdp.py index c41199adb480e..e6c684967ed40 100644 --- a/src/lightning/pytorch/plugins/precision/fsdp.py +++ b/src/lightning/pytorch/plugins/precision/fsdp.py @@ -22,7 +22,6 @@ from lightning.fabric.plugins.precision.amp import _optimizer_handles_unscaling from lightning.fabric.plugins.precision.fsdp import _PRECISION_INPUT from lightning.fabric.plugins.precision.utils import _convert_fp_tensor, _DtypeContextManager -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.fabric.utilities.types 
import Optimizable from lightning.pytorch.plugins.precision.precision import Precision from lightning.pytorch.utilities.exceptions import MisconfigurationException @@ -87,21 +86,18 @@ def clip_grad_by_norm(self, *_: Any, **__: Any) -> None: def mixed_precision_config(self) -> "TorchMixedPrecision": from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision as TorchMixedPrecision - # With PyTorch < 2.0, FSDP uses the noneness of `param_dtype` as a proxy for the `_uses_param_mixed_precision` - # property. In order to avoid FSDP assertion failures, we therefore avoid setting `param_dtype` to - # `torch.float32` here with PyTorch < 2.0. if self.precision == "16-mixed": - param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32 + param_dtype = torch.float32 reduce_dtype = buffer_dtype = torch.float16 elif self.precision == "bf16-mixed": - param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32 + param_dtype = torch.float32 reduce_dtype = buffer_dtype = torch.bfloat16 elif self.precision == "16-true": param_dtype = reduce_dtype = buffer_dtype = torch.float16 elif self.precision == "bf16-true": param_dtype = reduce_dtype = buffer_dtype = torch.bfloat16 elif self.precision == "32-true": - param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32 + param_dtype = torch.float32 reduce_dtype = buffer_dtype = torch.float32 else: raise MisconfigurationException(f"Was unable to infer precision type, received {self.precision!r}.") diff --git a/src/lightning/pytorch/strategies/deepspeed.py b/src/lightning/pytorch/strategies/deepspeed.py index 2cc099be39d26..6be3d3f8ba590 100644 --- a/src/lightning/pytorch/strategies/deepspeed.py +++ b/src/lightning/pytorch/strategies/deepspeed.py @@ -24,6 +24,7 @@ import torch from torch.nn import Module from torch.optim import Optimizer +from torch.optim.lr_scheduler import LRScheduler, ReduceLROnPlateau from typing_extensions import override import lightning.pytorch as pl @@ -37,7 +38,7 @@ ) from lightning.fabric.utilities.optimizer import _optimizers_to_device from lightning.fabric.utilities.seed import reset_seed -from lightning.fabric.utilities.types import _PATH, LRScheduler, ReduceLROnPlateau +from lightning.fabric.utilities.types import _PATH from lightning.pytorch.accelerators.cuda import CUDAAccelerator from lightning.pytorch.core.optimizer import _init_optimizers_and_lr_schedulers from lightning.pytorch.plugins.precision import Precision diff --git a/src/lightning/pytorch/strategies/fsdp.py b/src/lightning/pytorch/strategies/fsdp.py index 4c6f1fec5fe15..657fb438c1e8a 100644 --- a/src/lightning/pytorch/strategies/fsdp.py +++ b/src/lightning/pytorch/strategies/fsdp.py @@ -54,10 +54,7 @@ _sync_ddp_if_available, ) from lightning.fabric.utilities.distributed import group as _group -from lightning.fabric.utilities.imports import ( - _TORCH_GREATER_EQUAL_2_0, - _TORCH_GREATER_EQUAL_2_1, -) +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_1 from lightning.fabric.utilities.init import _EmptyInit from lightning.fabric.utilities.load import _lazy_load, _materialize_tensors from lightning.fabric.utilities.optimizer import _optimizers_to_device @@ -75,14 +72,9 @@ if TYPE_CHECKING: from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload, MixedPrecision, ShardingStrategy + from torch.distributed.fsdp.wrap import ModuleWrapPolicy - if _TORCH_GREATER_EQUAL_2_0: - from torch.distributed.fsdp.wrap import ModuleWrapPolicy - - _POLICY = Union[Set[Type[Module]], Callable[[Module, bool, 
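A standalone sketch of the simplified precision-to-dtype mapping above (illustrative only, not the plugin), showing that param_dtype can now always be torch.float32 for the mixed and 32-bit modes:

import torch
from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision

def mixed_precision_for(precision: str) -> MixedPrecision:
    if precision == "16-mixed":
        return MixedPrecision(param_dtype=torch.float32, reduce_dtype=torch.float16, buffer_dtype=torch.float16)
    if precision == "bf16-mixed":
        return MixedPrecision(param_dtype=torch.float32, reduce_dtype=torch.bfloat16, buffer_dtype=torch.bfloat16)
    if precision == "16-true":
        return MixedPrecision(param_dtype=torch.float16, reduce_dtype=torch.float16, buffer_dtype=torch.float16)
    if precision == "bf16-true":
        return MixedPrecision(param_dtype=torch.bfloat16, reduce_dtype=torch.bfloat16, buffer_dtype=torch.bfloat16)
    if precision == "32-true":
        return MixedPrecision(param_dtype=torch.float32, reduce_dtype=torch.float32, buffer_dtype=torch.float32)
    raise ValueError(f"Unable to infer precision type, received {precision!r}.")

config = mixed_precision_for("bf16-mixed")
assert config.param_dtype is torch.float32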
int], bool], ModuleWrapPolicy] - else: - _POLICY = Union[Set[Type[Module]], Callable[[Module, bool, int], bool]] # type: ignore[misc] - + _POLICY = Union[Set[Type[Module]], Callable[[Module, bool, int], bool], ModuleWrapPolicy] _SHARDING_STRATEGY = Union[ShardingStrategy, Literal["FULL_SHARD", "SHARD_GRAD_OP", "NO_SHARD", "HYBRID_SHARD"]] @@ -175,20 +167,13 @@ def __init__( self.kwargs = _auto_wrap_policy_kwargs(auto_wrap_policy, kwargs) self.sharding_strategy = _init_sharding_strategy(sharding_strategy, self.kwargs) - if _TORCH_GREATER_EQUAL_2_0: - # Avoids the need for user to reference params in `configure_optimizers` via - # `self.trainer.model.parameters()` and enables support for multiple parameter groups. - self.kwargs.setdefault("use_orig_params", True) + # Avoids the need for user to reference params in `configure_optimizers` via + # `self.trainer.model.parameters()` and enables support for multiple parameter groups. + self.kwargs.setdefault("use_orig_params", True) self._activation_checkpointing_kwargs = _activation_checkpointing_kwargs( activation_checkpointing, activation_checkpointing_policy ) - - if state_dict_type == "sharded" and not _TORCH_GREATER_EQUAL_2_0: - raise NotImplementedError( - "Saving checkpoints with `FSDPStrategy(state_dict_type='sharded')` is not supported in PyTorch < 2.0." - " Please upgrade `torch`." - ) self._state_dict_type = state_dict_type @property @@ -517,10 +502,6 @@ def load_model_state_dict(self, checkpoint: Mapping[str, Any], strict: bool = Tr @override def optimizer_state(self, optimizer: Optimizer) -> Dict[str, Tensor]: - if not _TORCH_GREATER_EQUAL_2_0: - rank_zero_warn("FSDP in Lightning with PyTorch < 2.0 does not support saving the optimizer state.") - return {} - from torch.distributed.fsdp import FullyShardedDataParallel as FSDP from torch.distributed.fsdp import OptimStateKeyType @@ -629,7 +610,7 @@ def load_checkpoint(self, checkpoint_path: _PATH) -> Dict[str, Any]: return metadata if _is_full_checkpoint(path): - checkpoint = _lazy_load(path) if _TORCH_GREATER_EQUAL_2_0 else torch.load(path, map_location="cpu") + checkpoint = _lazy_load(path) _load_raw_module_state( checkpoint.pop("state_dict"), module=self.model, @@ -637,10 +618,9 @@ def load_checkpoint(self, checkpoint_path: _PATH) -> Dict[str, Any]: strict=self.lightning_module.strict_loading, ) - if _TORCH_GREATER_EQUAL_2_0: - # Materialize lazy tensors if there are any left in the checkpoint - # The `torch.Optimizer.load_state_dict` method can't load lazy tensors because of deepcopy pickle issues - checkpoint = _materialize_tensors(checkpoint) + # Materialize lazy tensors if there are any left in the checkpoint + # The `torch.Optimizer.load_state_dict` method can't load lazy tensors because of deepcopy pickle issues + checkpoint = _materialize_tensors(checkpoint) from torch.distributed.fsdp import FullyShardedDataParallel as FSDP from torch.distributed.fsdp import OptimStateKeyType @@ -649,9 +629,6 @@ def load_checkpoint(self, checkpoint_path: _PATH) -> Dict[str, Any]: if optimizer_states is None or self.lightning_module.trainer.state.fn != TrainerFn.FITTING: # If the optimizer states are not present, we don't need to do anything (backward compatibility) return checkpoint - if not _TORCH_GREATER_EQUAL_2_0: - rank_zero_warn("FSDP in Lightning with PyTorch < 2.0 does not support loading the optimizer state.") - return checkpoint if len(self.optimizers) != len(optimizer_states): raise RuntimeError( f"You have configured {len(self.optimizers)} optimizers but the checkpoint contains" 
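A short usage sketch of the use_orig_params default set above; it mirrors the behaviour asserted in test_fsdp_use_orig_params further down in this patch:

from lightning.pytorch.strategies import FSDPStrategy

strategy = FSDPStrategy()
assert strategy.kwargs["use_orig_params"]        # enabled by default on torch >= 2.0
strategy = FSDPStrategy(use_orig_params=False)   # can still be turned off explicitly
assert not strategy.kwargs["use_orig_params"]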
diff --git a/src/lightning/pytorch/strategies/strategy.py b/src/lightning/pytorch/strategies/strategy.py index f2acd8ac98eba..9534822939f66 100644 --- a/src/lightning/pytorch/strategies/strategy.py +++ b/src/lightning/pytorch/strategies/strategy.py @@ -13,7 +13,7 @@ # limitations under the License. import logging from abc import ABC, abstractmethod -from contextlib import contextmanager, nullcontext +from contextlib import contextmanager from typing import Any, Callable, Dict, Generator, List, Mapping, Optional, Tuple, TypeVar, Union import torch @@ -26,7 +26,6 @@ from lightning.fabric.strategies import _StrategyRegistry from lightning.fabric.utilities import move_data_to_device from lightning.fabric.utilities.distributed import ReduceOp -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.fabric.utilities.init import _EmptyInit from lightning.fabric.utilities.optimizer import _optimizer_to_device, _optimizers_to_device from lightning.fabric.utilities.types import _PATH @@ -509,9 +508,8 @@ def tensor_init_context(self, empty_init: Optional[bool] = None) -> Generator[No If ``None``, the strategy will decide. Some strategies may not support all options. """ - device_context = self.root_device if _TORCH_GREATER_EQUAL_2_0 else nullcontext() empty_init_context = _EmptyInit(enabled=bool(empty_init)) - with empty_init_context, device_context, self.precision_plugin.tensor_init_context(): + with empty_init_context, self.root_device, self.precision_plugin.tensor_init_context(): yield @contextmanager diff --git a/src/lightning/pytorch/trainer/connectors/logger_connector/result.py b/src/lightning/pytorch/trainer/connectors/logger_connector/result.py index 7e0ef433031bd..d7320c2c2e251 100644 --- a/src/lightning/pytorch/trainer/connectors/logger_connector/result.py +++ b/src/lightning/pytorch/trainer/connectors/logger_connector/result.py @@ -24,7 +24,7 @@ from lightning.fabric.utilities import move_data_to_device from lightning.fabric.utilities.apply_func import convert_tensors_to_scalars from lightning.fabric.utilities.distributed import _distributed_is_initialized -from lightning.fabric.utilities.imports import _TORCH_EQUAL_2_0, _TORCH_GREATER_EQUAL_2_0 +from lightning.fabric.utilities.imports import _TORCH_EQUAL_2_0 from lightning.pytorch.utilities.data import extract_batch_size from lightning.pytorch.utilities.exceptions import MisconfigurationException from lightning.pytorch.utilities.imports import _TORCHMETRICS_GREATER_EQUAL_1_0_0 @@ -305,9 +305,7 @@ def __repr__(self) -> str: @override def to(self, *args: Any, **kwargs: Any) -> "_ResultMetric": - d = self.__dict__ - if _TORCH_GREATER_EQUAL_2_0: # https://github.com/pytorch/pytorch/issues/96198 - d = dict(d) + d = dict(self.__dict__) self.__dict__.update(apply_to_collection(d, (Tensor, Metric), move_data_to_device, *args, **kwargs)) return self diff --git a/src/lightning/pytorch/trainer/trainer.py b/src/lightning/pytorch/trainer/trainer.py index 6436fc54b7bed..bf7d47a880da3 100644 --- a/src/lightning/pytorch/trainer/trainer.py +++ b/src/lightning/pytorch/trainer/trainer.py @@ -35,7 +35,6 @@ import lightning.pytorch as pl from lightning.fabric.utilities.apply_func import convert_tensors_to_scalars from lightning.fabric.utilities.cloud_io import _is_local_file_protocol -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.fabric.utilities.types import _PATH from lightning.pytorch.accelerators import Accelerator from lightning.pytorch.callbacks import Callback, Checkpoint, 
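The nullcontext() fallback in tensor_init_context disappears because a torch.device is itself a context manager in torch >= 2.0. A minimal sketch (CPU used here so it runs anywhere):

import torch

device = torch.device("cpu")  # substitute torch.device("cuda", 0) on a GPU machine
with device:
    tensor = torch.empty(2, 2)
    layer = torch.nn.Linear(2, 2)
assert tensor.device == device
assert layer.weight.device == device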
EarlyStopping, ProgressBar @@ -1018,9 +1017,7 @@ def _teardown(self) -> None: def _run_stage(self) -> Optional[Union[_PREDICT_OUTPUT, _EVALUATE_OUTPUT]]: # wait for all to join if on distributed self.strategy.barrier("run-stage") - - zero_grad_kwargs = {} if _TORCH_GREATER_EQUAL_2_0 else {"set_to_none": True} - self.lightning_module.zero_grad(**zero_grad_kwargs) + self.lightning_module.zero_grad() if self.evaluating: return self._evaluation_loop.run() @@ -1084,8 +1081,7 @@ def init_module(self, empty_init: Optional[bool] = None) -> Generator: the right data type depending on the precision setting in the Trainer. The parameters and tensors get created on the device and with the right data type right away without wasting - memory being allocated unnecessarily. The automatic device placement under this context manager is only - supported with PyTorch 2.0 and newer. + memory being allocated unnecessarily. Args: empty_init: Whether to initialize the model with empty weights (uninitialized memory). @@ -1093,13 +1089,6 @@ def init_module(self, empty_init: Optional[bool] = None) -> Generator: Set this to ``True`` if you are loading a checkpoint into a large model. """ - if not _TORCH_GREATER_EQUAL_2_0 and self.strategy.root_device.type != "cpu": - rank_zero_warn( - "`Trainer.init_module()` can't place tensors on the device directly" - " with PyTorch < 2.0. Parameters will remain on CPU until the trainer starts." - " Upgrade to PyTorch >= 2.0 to fully utilize this feature.", - category=PossibleUserWarning, - ) if is_overridden("model_sharded_context", self.strategy, parent=Strategy): # warning instead of error so that code changes are not required when changing strategies # this is a limitation because processes are not expected to have been launched when this is called diff --git a/src/lightning/pytorch/tuner/lr_finder.py b/src/lightning/pytorch/tuner/lr_finder.py index f39788b8ea290..8eebd3cd7f974 100644 --- a/src/lightning/pytorch/tuner/lr_finder.py +++ b/src/lightning/pytorch/tuner/lr_finder.py @@ -16,19 +16,19 @@ import os import uuid from copy import deepcopy -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union, cast +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union import torch from lightning_utilities.core.imports import RequirementCache +from torch.optim.lr_scheduler import LRScheduler from typing_extensions import override import lightning.pytorch as pl -from lightning.fabric.utilities.types import _TORCH_LRSCHEDULER from lightning.pytorch.callbacks import Callback from lightning.pytorch.utilities.exceptions import MisconfigurationException from lightning.pytorch.utilities.parsing import lightning_hasattr, lightning_setattr from lightning.pytorch.utilities.rank_zero import rank_zero_warn -from lightning.pytorch.utilities.types import STEP_OUTPUT, LRScheduler, LRSchedulerConfig +from lightning.pytorch.utilities.types import STEP_OUTPUT, LRSchedulerConfig # check if ipywidgets is installed before importing tqdm.auto # to ensure it won't fail and a progress bar is displayed @@ -127,7 +127,6 @@ def _exchange_scheduler(self, trainer: "pl.Trainer") -> None: args = (optimizer, self.lr_max, self.num_training) scheduler = _LinearLR(*args) if self.mode == "linear" else _ExponentialLR(*args) - scheduler = cast(LRScheduler, scheduler) trainer.strategy.optimizers = [optimizer] trainer.strategy.lr_scheduler_configs = [LRSchedulerConfig(scheduler, interval="step")] @@ -439,7 +438,7 @@ def on_train_batch_end( self.losses.append(smoothed_loss) -class 
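A usage sketch for the init_module() docstring above. Assumptions: a machine with one CUDA GPU, and BoringModel standing in for a real model:

from lightning.pytorch import Trainer
from lightning.pytorch.demos.boring_classes import BoringModel

trainer = Trainer(accelerator="cuda", devices=1, precision="16-true")
with trainer.init_module():
    # parameters are created directly on the GPU and in the target precision,
    # skipping the usual CPU round-trip
    model = BoringModel()
assert model.layer.weight.device.type == "cuda"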
_LinearLR(_TORCH_LRSCHEDULER): +class _LinearLR(LRScheduler): """Linearly increases the learning rate between two boundaries over a number of iterations. Args: @@ -459,9 +458,8 @@ def __init__(self, optimizer: torch.optim.Optimizer, end_lr: float, num_iter: in self.num_iter = num_iter super().__init__(optimizer, last_epoch) - # mypy can't follow the _TORCH_LRSCHEDULER TypeAlias, so ignore "no base method" error - @override # type: ignore[misc] - def get_lr(self) -> List[float]: + @override + def get_lr(self) -> List[float]: # type: ignore[override] curr_iter = self.last_epoch + 1 r = curr_iter / self.num_iter @@ -477,7 +475,7 @@ def lr(self) -> Union[float, List[float]]: return self._lr -class _ExponentialLR(_TORCH_LRSCHEDULER): +class _ExponentialLR(LRScheduler): """Exponentially increases the learning rate between two boundaries over a number of iterations. Arguments: @@ -497,9 +495,8 @@ def __init__(self, optimizer: torch.optim.Optimizer, end_lr: float, num_iter: in self.num_iter = num_iter super().__init__(optimizer, last_epoch) - # mypy can't follow the _TORCH_LRSCHEDULER TypeAlias, so ignore "no base method" error - @override # type: ignore[misc] - def get_lr(self) -> List[float]: + @override + def get_lr(self) -> List[float]: # type: ignore[override] curr_iter = self.last_epoch + 1 r = curr_iter / self.num_iter diff --git a/src/lightning/pytorch/utilities/compile.py b/src/lightning/pytorch/utilities/compile.py index a77ed553d418e..7c5a8067740a4 100644 --- a/src/lightning/pytorch/utilities/compile.py +++ b/src/lightning/pytorch/utilities/compile.py @@ -14,14 +14,15 @@ from typing import Union import torch +from torch._dynamo import OptimizedModule import lightning.pytorch as pl -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0, _TORCH_GREATER_EQUAL_2_1 +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_1 from lightning.pytorch.strategies import DDPStrategy, DeepSpeedStrategy, FSDPStrategy, SingleDeviceStrategy, Strategy from lightning.pytorch.utilities.model_helpers import _check_mixed_imports -def from_compiled(model: "torch._dynamo.OptimizedModule") -> "pl.LightningModule": +def from_compiled(model: OptimizedModule) -> "pl.LightningModule": """Returns an instance LightningModule from the output of ``torch.compile``. .. warning:: This is an :ref:`experimental ` feature. @@ -33,11 +34,6 @@ def from_compiled(model: "torch._dynamo.OptimizedModule") -> "pl.LightningModule Use this method to obtain a LightningModule that still runs with all the optimizations from ``torch.compile``. """ - if not _TORCH_GREATER_EQUAL_2_0: - raise ModuleNotFoundError("`from_compiled` requires torch>=2.0") - - from torch._dynamo import OptimizedModule - if not isinstance(model, OptimizedModule): raise ValueError(f"`model` is required to be a `OptimizedModule`. Found a `{type(model).__name__}` instead.") @@ -82,11 +78,6 @@ def to_uncompiled(model: Union["pl.LightningModule", "torch._dynamo.OptimizedMod Note: this method will in-place modify the ``LightningModule`` that is passed in. 
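Because _LinearLR and _ExponentialLR now derive from the public LRScheduler directly, the mypy escape hatch for the old alias is no longer needed. A self-contained sketch of the same subclassing pattern, using a hypothetical WarmupLR that is not part of the patch:

from typing import List

import torch
from torch.optim.lr_scheduler import LRScheduler

class WarmupLR(LRScheduler):
    """Hypothetical scheduler: linearly ramps each learning rate over ``num_iter`` steps."""

    def __init__(self, optimizer: torch.optim.Optimizer, num_iter: int, last_epoch: int = -1) -> None:
        self.num_iter = num_iter
        super().__init__(optimizer, last_epoch)

    def get_lr(self) -> List[float]:
        scale = min((self.last_epoch + 1) / self.num_iter, 1.0)
        return [base_lr * scale for base_lr in self.base_lrs]

optimizer = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.1)
scheduler = WarmupLR(optimizer, num_iter=10)
optimizer.step()
scheduler.step()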
""" - if not _TORCH_GREATER_EQUAL_2_0: - raise ModuleNotFoundError("`to_uncompiled` requires torch>=2.0") - - from torch._dynamo import OptimizedModule - if isinstance(model, OptimizedModule): original = model._orig_mod if not isinstance(original, pl.LightningModule): @@ -117,13 +108,6 @@ def to_uncompiled(model: Union["pl.LightningModule", "torch._dynamo.OptimizedMod def _maybe_unwrap_optimized(model: object) -> "pl.LightningModule": - if not _TORCH_GREATER_EQUAL_2_0: - if not isinstance(model, pl.LightningModule): - _check_mixed_imports(model) - raise TypeError(f"`model` must be a `LightningModule`, got `{type(model).__qualname__}`") - return model - from torch._dynamo import OptimizedModule - if isinstance(model, OptimizedModule): return from_compiled(model) if isinstance(model, pl.LightningModule): diff --git a/src/lightning/pytorch/utilities/model_summary/model_summary.py b/src/lightning/pytorch/utilities/model_summary/model_summary.py index ef2827d3b7eed..806724e1c434a 100644 --- a/src/lightning/pytorch/utilities/model_summary/model_summary.py +++ b/src/lightning/pytorch/utilities/model_summary/model_summary.py @@ -25,7 +25,6 @@ from torch.utils.hooks import RemovableHandle import lightning.pytorch as pl -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.pytorch.utilities.model_helpers import _ModuleMode from lightning.pytorch.utilities.rank_zero import WarningCache @@ -107,10 +106,7 @@ def hook_with_kwargs(_: nn.Module, args: Any, kwargs: Any, out: Any) -> None: handle = None if not isinstance(self._module, torch.jit.ScriptModule): - if _TORCH_GREATER_EQUAL_2_0: - handle = self._module.register_forward_hook(hook_with_kwargs, with_kwargs=True) - else: - handle = self._module.register_forward_hook(hook) + handle = self._module.register_forward_hook(hook_with_kwargs, with_kwargs=True) return handle diff --git a/src/lightning/pytorch/utilities/testing/_runif.py b/src/lightning/pytorch/utilities/testing/_runif.py index c3e0262d9906f..03b3afd61b875 100644 --- a/src/lightning/pytorch/utilities/testing/_runif.py +++ b/src/lightning/pytorch/utilities/testing/_runif.py @@ -15,7 +15,6 @@ from lightning_utilities.core.imports import RequirementCache -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.fabric.utilities.testing import _runif_reasons as fabric_run_if from lightning.pytorch.accelerators.cpu import _PSUTIL_AVAILABLE from lightning.pytorch.callbacks.progress.rich_progress import _RICH_AVAILABLE @@ -94,7 +93,7 @@ def _runif_reasons( if sklearn and not _SKLEARN_AVAILABLE: reasons.append("scikit-learn") - if onnx and _TORCH_GREATER_EQUAL_2_0 and not _ONNX_AVAILABLE: + if onnx and not _ONNX_AVAILABLE: reasons.append("onnx") return reasons, kwargs diff --git a/src/lightning/pytorch/utilities/types.py b/src/lightning/pytorch/utilities/types.py index 203df53f22ba7..bc75e0f50aeb0 100644 --- a/src/lightning/pytorch/utilities/types.py +++ b/src/lightning/pytorch/utilities/types.py @@ -38,10 +38,11 @@ import torch from torch import Tensor from torch.optim import Optimizer +from torch.optim.lr_scheduler import LRScheduler, ReduceLROnPlateau from torchmetrics import Metric from typing_extensions import NotRequired, Required -from lightning.fabric.utilities.types import _TORCH_LRSCHEDULER, LRScheduler, ProcessGroup, ReduceLROnPlateau +from lightning.fabric.utilities.types import ProcessGroup _NUMBER = Union[int, float] _METRIC = Union[Metric, Tensor, _NUMBER] @@ -76,15 +77,15 @@ def no_sync(self) -> Generator: ... 
# todo: improve LRSchedulerType naming/typing -LRSchedulerTypeTuple = (_TORCH_LRSCHEDULER, torch.optim.lr_scheduler.ReduceLROnPlateau) -LRSchedulerTypeUnion = Union[_TORCH_LRSCHEDULER, torch.optim.lr_scheduler.ReduceLROnPlateau] -LRSchedulerType = Union[Type[_TORCH_LRSCHEDULER], Type[torch.optim.lr_scheduler.ReduceLROnPlateau]] +LRSchedulerTypeTuple = (LRScheduler, ReduceLROnPlateau) +LRSchedulerTypeUnion = Union[LRScheduler, ReduceLROnPlateau] +LRSchedulerType = Union[Type[LRScheduler], Type[ReduceLROnPlateau]] LRSchedulerPLType = Union[LRScheduler, ReduceLROnPlateau] @dataclass class LRSchedulerConfig: - scheduler: Union[_TORCH_LRSCHEDULER, ReduceLROnPlateau] + scheduler: Union[LRScheduler, ReduceLROnPlateau] # no custom name name: Optional[str] = None # after epoch is over diff --git a/tests/tests_fabric/accelerators/test_cuda.py b/tests/tests_fabric/accelerators/test_cuda.py index 4b2265670b8bf..e323ada908cd1 100644 --- a/tests/tests_fabric/accelerators/test_cuda.py +++ b/tests/tests_fabric/accelerators/test_cuda.py @@ -25,8 +25,6 @@ CUDAAccelerator, _check_cuda_matmul_precision, find_usable_cuda_devices, - is_cuda_available, - num_cuda_devices, ) from tests_fabric.helpers.runif import RunIf @@ -67,18 +65,6 @@ def test_set_cuda_device(_, set_device_mock): set_device_mock.assert_called_once_with(device) -@mock.patch("lightning.fabric.accelerators.cuda._device_count_nvml", return_value=-1) -@mock.patch("torch.cuda.is_available", return_value=True) -@mock.patch("torch.cuda.device_count", return_value=100) -def test_num_cuda_devices_without_nvml(*_): - """Test that if NVML can't be loaded, our helper functions fall back to the default implementation for determining - CUDA availability.""" - num_cuda_devices.cache_clear() - assert is_cuda_available() - assert num_cuda_devices() == 100 - num_cuda_devices.cache_clear() - - @mock.patch.dict(os.environ, {}, clear=True) def test_force_nvml_based_cuda_check(): """Test that we force PyTorch to use the NVML-based CUDA checks.""" diff --git a/tests/tests_fabric/plugins/precision/test_fsdp.py b/tests/tests_fabric/plugins/precision/test_fsdp.py index 74c1034518c39..148292dcd48df 100644 --- a/tests/tests_fabric/plugins/precision/test_fsdp.py +++ b/tests/tests_fabric/plugins/precision/test_fsdp.py @@ -26,25 +26,9 @@ [ ("16-true", (torch.float16, torch.float16, torch.float16)), ("bf16-true", (torch.bfloat16, torch.bfloat16, torch.bfloat16)), - pytest.param( - "16-mixed", (torch.float32, torch.float16, torch.float16), marks=RunIf(min_torch="2.0"), id="16-mixed-ge2_0" - ), - pytest.param( - "16-mixed", (None, torch.float16, torch.float16), marks=RunIf(max_torch="2.0"), id="16-mixed-lt2_0" - ), - pytest.param( - "bf16-mixed", - (torch.float32, torch.bfloat16, torch.bfloat16), - marks=RunIf(min_torch="2.0"), - id="bf16-mixed-ge2_0", - ), - pytest.param( - "bf16-mixed", (None, torch.bfloat16, torch.bfloat16), marks=RunIf(max_torch="2.0"), id="bf16-mixed-lt2_0" - ), - pytest.param( - "32-true", (torch.float32, torch.float32, torch.float32), marks=RunIf(min_torch="2.0"), id="32-true-ge2_0" - ), - pytest.param("32-true", (None, torch.float32, torch.float32), marks=RunIf(max_torch="2.0"), id="32-true-lt2_0"), + ("16-mixed", (torch.float32, torch.float16, torch.float16)), + ("bf16-mixed", (torch.float32, torch.bfloat16, torch.bfloat16)), + ("32-true", (torch.float32, torch.float32, torch.float32)), ], ) def test_fsdp_precision_config(precision, expected): diff --git a/tests/tests_fabric/strategies/test_ddp.py b/tests/tests_fabric/strategies/test_ddp.py index 
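For reference, a typical configure_optimizers() producing the objects these aliases describe; Lightning converts the returned dict into an LRSchedulerConfig internally. This is standard usage rather than new code in this patch, and the monitored metric name is a placeholder:

import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau

from lightning.pytorch.demos.boring_classes import BoringModel

class PlateauModel(BoringModel):
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        scheduler = ReduceLROnPlateau(optimizer, mode="min")
        return {
            "optimizer": optimizer,
            "lr_scheduler": {"scheduler": scheduler, "monitor": "val_loss", "interval": "epoch"},
        }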
beea7eccb69c2..56d9875dfefed 100644 --- a/tests/tests_fabric/strategies/test_ddp.py +++ b/tests/tests_fabric/strategies/test_ddp.py @@ -23,7 +23,6 @@ from lightning.fabric.plugins.environments import LightningEnvironment from lightning.fabric.strategies import DDPStrategy from lightning.fabric.strategies.ddp import _DDPBackwardSyncControl -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from torch.nn.parallel import DistributedDataParallel from tests_fabric.helpers.runif import RunIf @@ -128,7 +127,7 @@ def __instancecheck__(self, instance): def test_module_init_context(precision, expected_dtype): """Test that the module under the init-context gets moved to the right device and dtype.""" parallel_devices = [torch.device("cuda", 0), torch.device("cuda", 1)] - expected_device = parallel_devices[1] if _TORCH_GREATER_EQUAL_2_0 else torch.device("cpu") + expected_device = parallel_devices[1] strategy = DDPStrategy( parallel_devices=parallel_devices, precision=precision, cluster_environment=LightningEnvironment() diff --git a/tests/tests_fabric/strategies/test_ddp_integration.py b/tests/tests_fabric/strategies/test_ddp_integration.py index 65eaacde2ff2c..6f003748b9cce 100644 --- a/tests/tests_fabric/strategies/test_ddp_integration.py +++ b/tests/tests_fabric/strategies/test_ddp_integration.py @@ -19,7 +19,7 @@ import pytest import torch from lightning.fabric import Fabric -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 +from torch._dynamo import OptimizedModule from torch.nn.parallel.distributed import DistributedDataParallel from tests_fabric.helpers.runif import RunIf @@ -71,15 +71,10 @@ def assert_params_equal(params0, params1): @RunIf(min_cuda_gpus=2, standalone=True, min_torch="2.1.0", dynamo=True) -@mock.patch( - "lightning.fabric.wrappers.torch.compile", - Mock(wraps=(torch.compile if _TORCH_GREATER_EQUAL_2_0 else None)), -) +@mock.patch("lightning.fabric.wrappers.torch.compile", Mock(wraps=torch.compile)) @mock.patch.dict(os.environ, {}) def test_reapply_compile(): """Test that Fabric can rewrap a compiled module such that compilation happens over the DDP-wrapper.""" - from torch._dynamo import OptimizedModule - fabric = Fabric(accelerator="cuda", devices=2, strategy="ddp") fabric.launch() diff --git a/tests/tests_fabric/strategies/test_fsdp.py b/tests/tests_fabric/strategies/test_fsdp.py index 3f2d02e06be2a..5eeed7065fb5f 100644 --- a/tests/tests_fabric/strategies/test_fsdp.py +++ b/tests/tests_fabric/strategies/test_fsdp.py @@ -33,8 +33,6 @@ from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload, FullyShardedDataParallel, MixedPrecision from torch.optim import Adam -from tests_fabric.helpers.runif import RunIf - def test_fsdp_custom_mixed_precision(): """Test that passing a custom mixed precision config works.""" @@ -74,7 +72,6 @@ def test_fsdp_sharding_strategy(): assert strategy.sharding_strategy == ShardingStrategy.NO_SHARD -@RunIf(min_torch="2.0") @pytest.mark.parametrize("sharding_strategy", ["HYBRID_SHARD", "_HYBRID_SHARD_ZERO2"]) def test_fsdp_hybrid_shard_configuration(sharding_strategy): """Test that the hybrid sharding strategies can only be used with automatic wrapping or a manually specified pg.""" @@ -108,22 +105,6 @@ def test_fsdp_checkpoint_io_unsupported(): strategy.checkpoint_io = Mock() -@pytest.mark.parametrize("torch_ge_2_0", [False, True]) -def test_fsdp_setup_optimizer_validation(torch_ge_2_0): - """Test that `setup_optimizer()` validates the param groups and reference to FSDP parameters.""" - 
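A usage sketch of selecting the sharding strategy exercised by the tests above; both the enum and its string name are accepted (illustrative only):

from torch.distributed.fsdp import ShardingStrategy

from lightning.fabric.strategies import FSDPStrategy

s1 = FSDPStrategy(sharding_strategy="SHARD_GRAD_OP")
s2 = FSDPStrategy(sharding_strategy=ShardingStrategy.NO_SHARD)
assert s1.sharding_strategy == ShardingStrategy.SHARD_GRAD_OP
assert s2.sharding_strategy == ShardingStrategy.NO_SHARD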
module = nn.Linear(2, 2) - with mock.patch("lightning.fabric.strategies.fsdp._TORCH_GREATER_EQUAL_2_0", torch_ge_2_0): - strategy = FSDPStrategy(parallel_devices=[torch.device("cpu")]) - bad_optimizer = Adam(module.parameters()) - - if torch_ge_2_0: - strategy.setup_optimizer(bad_optimizer) - else: - with pytest.raises(ValueError, match="The optimizer does not seem to reference any FSDP parameter"): - strategy.setup_optimizer(bad_optimizer) - - -@RunIf(min_torch="2.0.0") @mock.patch("lightning.fabric.strategies.fsdp.FSDPStrategy.setup_module") def test_fsdp_setup_use_orig_params(_): module = nn.Linear(2, 2) @@ -234,7 +215,6 @@ def test_fsdp_grad_clipping_norm_error(): strategy.clip_gradients_norm(Mock(), Mock(), Mock()) -@RunIf(min_torch="2.0.0") def test_fsdp_save_checkpoint_storage_options(tmp_path): """Test that the FSDP strategy does not accept storage options for saving checkpoints.""" strategy = FSDPStrategy() @@ -242,7 +222,6 @@ def test_fsdp_save_checkpoint_storage_options(tmp_path): strategy.save_checkpoint(path=tmp_path, state=Mock(), storage_options=Mock()) -@RunIf(min_torch="2.0.0") @mock.patch("lightning.fabric.strategies.fsdp.FSDPStrategy.broadcast", lambda _, x: x) @mock.patch("lightning.fabric.strategies.fsdp._get_full_state_dict_context") @mock.patch("lightning.fabric.strategies.fsdp._get_sharded_state_dict_context") @@ -305,7 +284,6 @@ def test_fsdp_save_checkpoint_path_exists(shutil_mock, torch_save_mock, __, ___, assert path.is_dir() -@RunIf(min_torch="2.0.0") @mock.patch("lightning.fabric.strategies.fsdp.FSDPStrategy.broadcast", lambda _, x: x) def test_fsdp_save_checkpoint_one_fsdp_module_required(tmp_path): """Test that the FSDP strategy can only save one FSDP model per checkpoint.""" @@ -326,7 +304,6 @@ def test_fsdp_save_checkpoint_one_fsdp_module_required(tmp_path): strategy.save_checkpoint(path=tmp_path, state={"model1": model1, "model2": model2}) -@RunIf(min_torch="2.0.0") def test_fsdp_load_checkpoint_no_state(tmp_path): """Test that the FSDP strategy can't load the full state without access to a model instance from the user.""" strategy = FSDPStrategy() @@ -336,7 +313,6 @@ def test_fsdp_load_checkpoint_no_state(tmp_path): strategy.load_checkpoint(path=tmp_path, state={}) -@RunIf(min_torch="2.0.0") @mock.patch("lightning.fabric.strategies.fsdp.FSDPStrategy.broadcast", lambda _, x: x) @mock.patch("lightning.fabric.strategies.fsdp._lazy_load", Mock()) def test_fsdp_load_checkpoint_one_fsdp_module_required(tmp_path): @@ -364,7 +340,6 @@ def test_fsdp_load_checkpoint_one_fsdp_module_required(tmp_path): strategy.load_checkpoint(path=path, state=model) -@RunIf(min_torch="2.0.0") @mock.patch("lightning.fabric.strategies.fsdp.FSDPStrategy.broadcast", lambda _, x: x) def test_fsdp_save_checkpoint_unknown_state_dict_type(tmp_path): strategy = FSDPStrategy(state_dict_type="invalid") @@ -374,7 +349,6 @@ def test_fsdp_save_checkpoint_unknown_state_dict_type(tmp_path): strategy.save_checkpoint(path=tmp_path, state={"model": model}) -@RunIf(min_torch="2.0.0") def test_fsdp_load_unknown_checkpoint_type(tmp_path): """Test that the strategy validates the contents at the checkpoint path.""" strategy = FSDPStrategy() @@ -386,7 +360,6 @@ def test_fsdp_load_unknown_checkpoint_type(tmp_path): strategy.load_checkpoint(path=path, state={"model": model}) -@RunIf(min_torch="2.0.0") def test_fsdp_load_raw_checkpoint_validate_single_file(tmp_path): """Test that we validate the given checkpoint is a single file when loading a raw PyTorch state-dict checkpoint.""" strategy = FSDPStrategy() 
@@ -397,7 +370,6 @@ def test_fsdp_load_raw_checkpoint_validate_single_file(tmp_path): strategy.load_checkpoint(path=path, state=model) -@RunIf(min_torch="2.0.0") def test_fsdp_load_raw_checkpoint_optimizer_unsupported(tmp_path): """Validate that the FSDP strategy does not yet support loading the raw PyTorch state-dict for an optimizer.""" strategy = FSDPStrategy() @@ -443,7 +415,6 @@ def test_has_meta_device_parameters(): _has_meta_device_parameters(None) -@RunIf(min_torch="2.0") @pytest.mark.parametrize("torch_ge_2_1", [True, False]) @mock.patch("torch.distributed.fsdp.fully_sharded_data_parallel.FullyShardedDataParallel.set_state_dict_type") def test_get_full_state_dict_context_offload(set_type_mock, monkeypatch, torch_ge_2_1): diff --git a/tests/tests_fabric/strategies/test_fsdp_integration.py b/tests/tests_fabric/strategies/test_fsdp_integration.py index 03d1d0979ea66..88d015c5cd138 100644 --- a/tests/tests_fabric/strategies/test_fsdp_integration.py +++ b/tests/tests_fabric/strategies/test_fsdp_integration.py @@ -23,12 +23,10 @@ from lightning.fabric import Fabric from lightning.fabric.plugins import FSDPPrecision from lightning.fabric.strategies import FSDPStrategy -from lightning.fabric.utilities.imports import ( - _TORCH_GREATER_EQUAL_2_0, - _TORCH_GREATER_EQUAL_2_1, -) +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_1 from lightning.fabric.utilities.load import _load_distributed_checkpoint from lightning.fabric.wrappers import _FabricOptimizer +from torch._dynamo import OptimizedModule from torch.distributed.fsdp import FlatParameter, FullyShardedDataParallel, OptimStateKeyType from torch.distributed.fsdp.wrap import always_wrap_policy, wrap from torch.nn import Parameter @@ -121,7 +119,7 @@ def get_model(self): return model -@RunIf(min_cuda_gpus=2, standalone=True, min_torch="2.0.0") +@RunIf(min_cuda_gpus=2, standalone=True) @pytest.mark.parametrize("precision", ["16-mixed", pytest.param("bf16-mixed", marks=RunIf(bf16_cuda=True))]) @pytest.mark.parametrize("manual_wrapping", [True, False]) def test_fsdp_train_save_load(tmp_path, manual_wrapping, precision): @@ -176,7 +174,7 @@ def test_fsdp_train_save_load(tmp_path, manual_wrapping, precision): assert state["coconut"] == 11 -@RunIf(min_cuda_gpus=2, standalone=True, min_torch="2.0.0") +@RunIf(min_cuda_gpus=2, standalone=True) def test_fsdp_save_full_state_dict(tmp_path): """Test that FSDP saves the full state into a single file with `state_dict_type="full"`.""" fabric = Fabric( @@ -290,7 +288,7 @@ def test_fsdp_save_full_state_dict(tmp_path): trainer.run() -@RunIf(min_cuda_gpus=2, standalone=True, min_torch="2.0.0") +@RunIf(min_cuda_gpus=2, standalone=True) def test_fsdp_load_full_state_dict_into_sharded_model(tmp_path): """Test that the strategy can load a full-state checkpoint into a FSDP sharded model.""" from torch.distributed.fsdp import FullyShardedDataParallel as FSDP @@ -362,11 +360,7 @@ def test_setup_module_move_to_device(fabric_module_mock, move_to_device): # the linear layer got sharded and each part is on the expected device assert next(fabric_model.parameters()).device == torch.device("cuda", fabric.local_rank) assert next(fabric_model.parameters()).numel() == 50 - if _TORCH_GREATER_EQUAL_2_0: - # In PyTorch >= 2.0 we set `use_orig_params=True` and don't see flattened parameters - assert isinstance(next(fabric_model.parameters()), Parameter) - else: - assert isinstance(next(fabric_model.parameters()), FlatParameter) + assert isinstance(next(fabric_model.parameters()), Parameter) # The 
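A condensed sketch of the Fabric + FSDP setup flow these integration tests exercise. Assumptions: at least two CUDA GPUs and a launched distributed run; this is a sketch of the pattern, not a test:

import torch
from torch.distributed.fsdp.wrap import always_wrap_policy

from lightning.fabric import Fabric
from lightning.fabric.strategies import FSDPStrategy

fabric = Fabric(accelerator="cuda", devices=2, strategy=FSDPStrategy(auto_wrap_policy=always_wrap_policy))
fabric.launch()

model = torch.nn.Linear(10, 10)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)  # created before wrapping; use_orig_params=True makes this valid
model, optimizer = fabric.setup(model, optimizer)

# the original (unflattened) parameters stay accessible through .parameters()
assert isinstance(next(model.parameters()), torch.nn.Parameter)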
_DeviceDtypeModuleMixin currently can't represent the device in a meaningful way for models with pieces on # different devices @@ -374,7 +368,7 @@ def test_setup_module_move_to_device(fabric_module_mock, move_to_device): assert fabric.device == torch.device("cuda", fabric.local_rank) -@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True, min_torch="2.0.0") +@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True) def test_setup_with_orig_params_and_multiple_param_groups(): """Test that Fabric sets `use_orig_params` for the user when jointly setting up model and optimizer.""" strategy = FSDPStrategy(auto_wrap_policy=always_wrap_policy) @@ -407,15 +401,10 @@ def test_setup_with_orig_params_and_multiple_param_groups(): @RunIf(min_cuda_gpus=2, standalone=True, min_torch="2.1.0", dynamo=True, skip_windows=True) -@mock.patch( - "lightning.fabric.wrappers.torch.compile", - Mock(wraps=(torch.compile if _TORCH_GREATER_EQUAL_2_0 else None)), -) +@mock.patch("lightning.fabric.wrappers.torch.compile", Mock(wraps=torch.compile)) @mock.patch.dict(os.environ, {}) def test_reapply_compile(): """Test that Fabric can rewrap a compiled module such that compilation happens over the FSDP-wrapper.""" - from torch._dynamo import OptimizedModule - strategy = FSDPStrategy(auto_wrap_policy=always_wrap_policy) fabric = Fabric(accelerator="cuda", devices=2, strategy=strategy) fabric.launch() @@ -485,7 +474,7 @@ def _run_setup_assertions(empty_init, expected_device): _run_setup_assertions(empty_init=True, expected_device=torch.device("cpu")) -@RunIf(min_cuda_gpus=2, standalone=True, min_torch="2.0.0") +@RunIf(min_cuda_gpus=2, standalone=True) def test_fsdp_save_filter(tmp_path): fabric = Fabric(accelerator="cuda", strategy=FSDPStrategy(state_dict_type="full"), devices=2) fabric.launch() diff --git a/tests/tests_fabric/strategies/test_strategy.py b/tests/tests_fabric/strategies/test_strategy.py index cbbbf963b3607..a7a1dba87cb97 100644 --- a/tests/tests_fabric/strategies/test_strategy.py +++ b/tests/tests_fabric/strategies/test_strategy.py @@ -18,7 +18,6 @@ import torch from lightning.fabric.plugins import DoublePrecision, HalfPrecision, Precision from lightning.fabric.strategies import SingleDeviceStrategy -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.fabric.utilities.types import _Stateful from tests_fabric.helpers.runif import RunIf @@ -239,8 +238,7 @@ def test_module_init_context(device, precision, dtype, empty_init, monkeypatch): with strategy.module_init_context(empty_init=empty_init): module = torch.nn.Linear(2, 2) - expected_device = device if _TORCH_GREATER_EQUAL_2_0 else torch.device("cpu") - assert module.weight.device == module.bias.device == expected_device + assert module.weight.device == module.bias.device == device assert module.weight.dtype == module.bias.dtype == dtype if not empty_init: init_mock.assert_called() @@ -274,8 +272,7 @@ def test_tensor_init_context(device, precision, dtype): tensor1 = torch.tensor(42) tensor2 = torch.tensor(42.0, dtype=torch.half) - expected_device = device if _TORCH_GREATER_EQUAL_2_0 else torch.device("cpu") - assert tensor0.device == tensor1.device == tensor2.device == expected_device + assert tensor0.device == tensor1.device == tensor2.device == device assert tensor0.dtype == dtype assert tensor1.dtype == torch.long # `.init_tensor()` only affects floating point dtypes assert tensor2.dtype == torch.half # this tensor was created with an explicit dtype assignment diff --git 
a/tests/tests_fabric/strategies/test_xla_fsdp.py b/tests/tests_fabric/strategies/test_xla_fsdp.py index bcd2a6e637417..e2864b684c4a7 100644 --- a/tests/tests_fabric/strategies/test_xla_fsdp.py +++ b/tests/tests_fabric/strategies/test_xla_fsdp.py @@ -27,7 +27,7 @@ from tests_fabric.helpers.runif import RunIf -@RunIf(min_torch="2.0", tpu=True) +@RunIf(tpu=True) def test_xla_fsdp_setup_optimizer_validation(): """Test that `setup_optimizer()` validates the param groups and reference to FSDP parameters.""" module = nn.Linear(2, 2) @@ -39,7 +39,7 @@ def test_xla_fsdp_setup_optimizer_validation(): strategy.setup_optimizer(bad_optimizer) -@RunIf(min_torch="2.0", tpu=True) +@RunIf(tpu=True) def test_xla_fsdp_no_backward_sync(): """Test that the backward sync control calls `.no_sync()`, and only on a module wrapped in XlaFullyShardedDataParallel.""" @@ -64,7 +64,7 @@ def test_xla_fsdp_no_backward_sync(): module.no_sync.assert_called_once() -@RunIf(min_torch="2.0", tpu=True) +@RunIf(tpu=True) def test_xla_fsdp_grad_clipping_value_error(): strategy = XLAFSDPStrategy() with pytest.raises(NotImplementedError, match="does not support to clip gradients by value"): diff --git a/tests/tests_fabric/strategies/test_xla_fsdp_integration.py b/tests/tests_fabric/strategies/test_xla_fsdp_integration.py index 999b8473b28aa..20c2ef042272e 100644 --- a/tests/tests_fabric/strategies/test_xla_fsdp_integration.py +++ b/tests/tests_fabric/strategies/test_xla_fsdp_integration.py @@ -45,7 +45,7 @@ def _xla_fsdp_rewrap_warning(fabric: Fabric): assert isinstance(model._forward_module[2], XlaFullyShardedDataParallel) -@RunIf(min_torch="2.0", tpu=True, standalone=True) +@RunIf(tpu=True, standalone=True) def test_xla_fsdp_rewrap_warning(): """Test that XLAFSDP warns about rewrapping the modules.""" from torch_xla.distributed.fsdp.wrap import always_wrap_policy @@ -159,7 +159,7 @@ def step(model, batch): torch.testing.assert_close(p0, p1, atol=0, rtol=0, equal_nan=True) -@RunIf(min_torch="2.0", tpu=True, standalone=True) +@RunIf(tpu=True, standalone=True) @pytest.mark.parametrize( ("use_auto_wrap_policy", "state_dict_type", "sequential_save"), [ @@ -196,7 +196,7 @@ def _test_setup_module_move_to_device(fabric, move_to_device): assert fabric.device.type == "xla" -@RunIf(min_torch="2.0", tpu=True, standalone=True) +@RunIf(tpu=True, standalone=True) @pytest.mark.parametrize("move_to_device", [True, False]) def test_setup_module_move_to_device(move_to_device): """Test that `move_to_device` does nothing, FSDP decides which device parameters get moved to which device diff --git a/tests/tests_fabric/test_fabric.py b/tests/tests_fabric/test_fabric.py index fde9479c73eaf..f76a846e80a75 100644 --- a/tests/tests_fabric/test_fabric.py +++ b/tests/tests_fabric/test_fabric.py @@ -623,7 +623,7 @@ def test_backward(): ("auto", "32-true", False), ("auto", "bf16-true", False), ("auto", "bf16-mixed", True), - pytest.param("fsdp", "32-true", True, marks=RunIf(min_cuda_gpus=1, min_torch="2.0.0")), + pytest.param("fsdp", "32-true", True, marks=RunIf(min_cuda_gpus=1)), ], ) @pytest.mark.parametrize("setup_method", ["setup", "setup_module"]) @@ -855,7 +855,6 @@ def test_module_sharding_context(): def test_init_module_context(monkeypatch): """Test that the strategy returns the context manager for initializing the module.""" - import lightning.fabric fabric = Fabric(accelerator="cpu") strategy = SingleDeviceStrategy(device=torch.device("cuda")) @@ -866,17 +865,9 @@ def test_init_module_context(monkeypatch): 
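A runnable sketch of Fabric.init_module() on CPU, matching the context-manager behaviour the test above checks (device and dtype come from the strategy's init context):

import torch
from lightning.fabric import Fabric

fabric = Fabric(accelerator="cpu", precision="bf16-true")
with fabric.init_module():
    layer = torch.nn.Linear(4, 4)

assert layer.weight.dtype == torch.bfloat16  # created directly in the target precision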
strategy.module_init_context.assert_called_once_with(empty_init=None) strategy.module_init_context.reset_mock() - # Pretend we are using PyTorch < 2.0 - monkeypatch.setattr(lightning.fabric.fabric, "_TORCH_GREATER_EQUAL_2_0", False) - with pytest.warns(PossibleUserWarning, match="can't place the model parameters on the device"): # noqa: SIM117 - with fabric.init_module(): - pass - strategy.module_init_context.assert_called_once() - def test_init_tensor_context(monkeypatch): """Test that `.init_tensor()` warns if using PyTorch < 2.0.""" - import lightning.fabric fabric = Fabric(accelerator="cpu") strategy = SingleDeviceStrategy(device=torch.device("cuda")) @@ -887,13 +878,6 @@ def test_init_tensor_context(monkeypatch): strategy.tensor_init_context.assert_called_once() strategy.tensor_init_context.reset_mock() - # Pretend we are using PyTorch < 2.0 - monkeypatch.setattr(lightning.fabric.fabric, "_TORCH_GREATER_EQUAL_2_0", False) - with pytest.warns(PossibleUserWarning, match="can't place tensors on the device directly"): # noqa: SIM117 - with fabric.init_tensor(): - pass - strategy.tensor_init_context.assert_called_once() - def test_callbacks_input(): """Test the various ways in which callbacks can be registered with Fabric.""" diff --git a/tests/tests_fabric/test_wrappers.py b/tests/tests_fabric/test_wrappers.py index 0923c601d51c3..599d8f085d16c 100644 --- a/tests/tests_fabric/test_wrappers.py +++ b/tests/tests_fabric/test_wrappers.py @@ -28,6 +28,7 @@ _unwrap_objects, is_wrapped, ) +from torch._dynamo import OptimizedModule from torch.utils.data import BatchSampler, DistributedSampler from torch.utils.data.dataloader import DataLoader @@ -492,8 +493,6 @@ def test_is_wrapped(compile): # _FabricModule inside an OptimizedModule if compile: - from torch._dynamo import OptimizedModule - module = torch.nn.Linear(2, 2) wrapped = torch.compile(_FabricModule(module, Mock())) assert isinstance(wrapped, OptimizedModule) @@ -624,11 +623,6 @@ def test_unwrap_compiled(): # We wrap `torch.compile` on import of lightning in `wrappers.py` assert torch.compile.__wrapped__ - with mock.patch("lightning.fabric.wrappers", "_TORCH_GREATER_EQUAL_2_0", False): - unwrapped, compile_kwargs = _unwrap_compiled(model) - assert unwrapped is model - assert compile_kwargs is None - compiled = torch.compile(model, fullgraph=True, dynamic=True, disable=False) assert compiled._compile_kwargs == {"fullgraph": True, "dynamic": True, "disable": False} unwrapped, compile_kwargs = _unwrap_compiled(compiled) diff --git a/tests/tests_fabric/utilities/test_load.py b/tests/tests_fabric/utilities/test_load.py index 574f8bf36247b..c53686ceb9a26 100644 --- a/tests/tests_fabric/utilities/test_load.py +++ b/tests/tests_fabric/utilities/test_load.py @@ -21,10 +21,7 @@ _NotYetLoadedTensor, ) -from tests_fabric.helpers.runif import RunIf - -@RunIf(min_torch="2.0.0") def test_lazy_load_module(tmp_path): model0 = nn.Linear(2, 2) torch.save(model0.state_dict(), tmp_path / "model.pt") @@ -43,7 +40,6 @@ class ATensor(torch.Tensor): pass -@RunIf(min_torch="2.0.0") def test_lazy_load_tensor(tmp_path): """Test that lazy load can handle different classes of tensors.""" expected = { @@ -61,7 +57,6 @@ def test_lazy_load_tensor(tmp_path): assert torch.equal(t0, t1_materialized) -@RunIf(min_torch="2.0.0") def test_lazy_load_mixed_state(tmp_path): model0 = nn.Linear(2, 2) optim0 = torch.optim.Adam(model0.parameters()) @@ -82,13 +77,11 @@ def test_lazy_load_mixed_state(tmp_path): optim1.load_state_dict(loaded_checkpoint["optimizer"]) 
-@RunIf(min_torch="2.0.0") def test_lazy_load_raises(): with pytest.raises(FileNotFoundError, match="foo' does not exist"): _lazy_load("foo") -@RunIf(min_torch="2.0.0") def test_materialize_tensors(tmp_path): # Single tensor tensor = torch.tensor([1, 2]) diff --git a/tests/tests_pytorch/core/test_lightning_module.py b/tests/tests_pytorch/core/test_lightning_module.py index d5aec835ad581..5ee91e82689f4 100644 --- a/tests/tests_pytorch/core/test_lightning_module.py +++ b/tests/tests_pytorch/core/test_lightning_module.py @@ -18,7 +18,6 @@ import pytest import torch from lightning.fabric import Fabric -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.pytorch import LightningModule, Trainer from lightning.pytorch.core.module import _TrainerFabricShim from lightning.pytorch.demos.boring_classes import BoringModel @@ -444,9 +443,6 @@ def test_trainer_reference_recursively(): ensemble.trainer = trainer # references match assert ensemble.trainer is inner.trainer - if not _TORCH_GREATER_EQUAL_2_0: - # and the trainer was weakly referenced - assert inner.trainer is weakref.proxy(trainer) def test_fabric_reference_recursively(): diff --git a/tests/tests_pytorch/models/test_hooks.py b/tests/tests_pytorch/models/test_hooks.py index aa56e8ca02ba4..763a140982a8d 100644 --- a/tests/tests_pytorch/models/test_hooks.py +++ b/tests/tests_pytorch/models/test_hooks.py @@ -18,7 +18,6 @@ import pytest import torch -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.pytorch import Callback, LightningDataModule, LightningModule, Trainer, __version__ from lightning.pytorch.demos.boring_classes import BoringDataModule, BoringModel, RandomDataset from lightning.pytorch.utilities.model_helpers import is_overridden @@ -479,7 +478,7 @@ def training_step(self, batch, batch_idx): {"name": "configure_optimizers"}, {"name": "Callback.on_fit_start", "args": (trainer, model)}, {"name": "on_fit_start"}, - {"name": "zero_grad", **({} if _TORCH_GREATER_EQUAL_2_0 else {"kwargs": {"set_to_none": True}})}, + {"name": "zero_grad"}, {"name": "Callback.on_sanity_check_start", "args": (trainer, model)}, {"name": "val_dataloader"}, {"name": "train", "args": (False,)}, @@ -497,7 +496,7 @@ def training_step(self, batch, batch_idx): {"name": "Callback.on_train_epoch_start", "args": (trainer, model)}, {"name": "on_train_epoch_start"}, *model._train_batch(trainer, model, train_batches, device=device, **kwargs), - {"name": "zero_grad", **({} if _TORCH_GREATER_EQUAL_2_0 else {"kwargs": {"set_to_none": True}})}, + {"name": "zero_grad"}, {"name": "on_validation_model_zero_grad"}, {"name": "train", "args": (False,)}, {"name": "on_validation_model_eval"}, @@ -577,7 +576,7 @@ def test_trainer_model_hook_system_fit_no_val_and_resume_max_epochs(tmp_path): {"name": "configure_optimizers"}, {"name": "Callback.on_fit_start", "args": (trainer, model)}, {"name": "on_fit_start"}, - {"name": "zero_grad", **({} if _TORCH_GREATER_EQUAL_2_0 else {"kwargs": {"set_to_none": True}})}, + {"name": "zero_grad"}, {"name": "train_dataloader"}, {"name": "Callback.on_train_start", "args": (trainer, model)}, {"name": "on_train_start"}, @@ -655,7 +654,7 @@ def test_trainer_model_hook_system_fit_no_val_and_resume_max_steps(tmp_path): {"name": "configure_optimizers"}, {"name": "Callback.on_fit_start", "args": (trainer, model)}, {"name": "on_fit_start"}, - {"name": "zero_grad", **({} if _TORCH_GREATER_EQUAL_2_0 else {"kwargs": {"set_to_none": True}})}, + {"name": "zero_grad"}, {"name": 
"train_dataloader"}, {"name": "Callback.on_train_start", "args": (trainer, model)}, {"name": "on_train_start"}, @@ -718,7 +717,7 @@ def test_trainer_model_hook_system_eval(tmp_path, override_on_x_model_train, bat {"name": "Callback.setup", "args": (trainer, model), "kwargs": {"stage": verb}}, {"name": "setup", "kwargs": {"stage": verb}}, {"name": "configure_model"}, - {"name": "zero_grad", **({} if _TORCH_GREATER_EQUAL_2_0 else {"kwargs": {"set_to_none": True}})}, + {"name": "zero_grad"}, *(hooks if batches else []), {"name": "Callback.teardown", "args": (trainer, model), "kwargs": {"stage": verb}}, {"name": "teardown", "kwargs": {"stage": verb}}, @@ -741,7 +740,7 @@ def test_trainer_model_hook_system_predict(tmp_path): {"name": "Callback.setup", "args": (trainer, model), "kwargs": {"stage": "predict"}}, {"name": "setup", "kwargs": {"stage": "predict"}}, {"name": "configure_model"}, - {"name": "zero_grad", **({} if _TORCH_GREATER_EQUAL_2_0 else {"kwargs": {"set_to_none": True}})}, + {"name": "zero_grad"}, {"name": "predict_dataloader"}, {"name": "train", "args": (False,)}, {"name": "on_predict_model_eval"}, diff --git a/tests/tests_pytorch/plugins/precision/test_fsdp.py b/tests/tests_pytorch/plugins/precision/test_fsdp.py index e4d652cb15864..6b19fdabdf6d6 100644 --- a/tests/tests_pytorch/plugins/precision/test_fsdp.py +++ b/tests/tests_pytorch/plugins/precision/test_fsdp.py @@ -26,25 +26,9 @@ [ ("16-true", (torch.float16, torch.float16, torch.float16)), ("bf16-true", (torch.bfloat16, torch.bfloat16, torch.bfloat16)), - pytest.param( - "16-mixed", (torch.float32, torch.float16, torch.float16), marks=RunIf(min_torch="2.0"), id="16-mixed-ge2_0" - ), - pytest.param( - "16-mixed", (None, torch.float16, torch.float16), marks=RunIf(max_torch="2.0"), id="16-mixed-lt2_0" - ), - pytest.param( - "bf16-mixed", - (torch.float32, torch.bfloat16, torch.bfloat16), - marks=RunIf(min_torch="2.0"), - id="bf16-mixed-ge2_0", - ), - pytest.param( - "bf16-mixed", (None, torch.bfloat16, torch.bfloat16), marks=RunIf(max_torch="2.0"), id="bf16-mixed-lt2_0" - ), - pytest.param( - "32-true", (torch.float32, torch.float32, torch.float32), marks=RunIf(min_torch="2.0"), id="32-true-ge2_0" - ), - pytest.param("32-true", (None, torch.float32, torch.float32), marks=RunIf(max_torch="2.0"), id="32-true-lt2_0"), + ("16-mixed", (torch.float32, torch.float16, torch.float16)), + ("bf16-mixed", (torch.float32, torch.bfloat16, torch.bfloat16)), + ("32-true", (torch.float32, torch.float32, torch.float32)), ], ) def test_fsdp_precision_config(precision, expected): diff --git a/tests/tests_pytorch/strategies/test_common.py b/tests/tests_pytorch/strategies/test_common.py index f352ead871102..699424b3c53b9 100644 --- a/tests/tests_pytorch/strategies/test_common.py +++ b/tests/tests_pytorch/strategies/test_common.py @@ -15,7 +15,6 @@ import pytest import torch -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.pytorch import Trainer from lightning.pytorch.plugins import DoublePrecision, HalfPrecision, Precision from lightning.pytorch.strategies import SingleDeviceStrategy @@ -82,8 +81,7 @@ def test_module_init_context(device, precision, dtype, empty_init, monkeypatch): with strategy.tensor_init_context(empty_init=empty_init): module = torch.nn.Linear(2, 2) - expected_device = device if _TORCH_GREATER_EQUAL_2_0 else torch.device("cpu") - assert module.weight.device == module.bias.device == expected_device + assert module.weight.device == module.bias.device == device assert module.weight.dtype == 
module.bias.dtype == dtype if not empty_init: init_mock.assert_called() diff --git a/tests/tests_pytorch/strategies/test_ddp.py b/tests/tests_pytorch/strategies/test_ddp.py index dadd49c359e06..b23d306b9d907 100644 --- a/tests/tests_pytorch/strategies/test_ddp.py +++ b/tests/tests_pytorch/strategies/test_ddp.py @@ -18,7 +18,6 @@ import pytest import torch from lightning.fabric.plugins.environments import LightningEnvironment -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.pytorch import LightningModule, Trainer from lightning.pytorch.demos.boring_classes import BoringModel from lightning.pytorch.plugins import DoublePrecision, HalfPrecision, Precision @@ -102,7 +101,7 @@ def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs, mps_count_ def test_tensor_init_context(precision_plugin, expected_dtype): """Test that the module under the init-context gets moved to the right device and dtype.""" parallel_devices = [torch.device("cuda", 0), torch.device("cuda", 1)] - expected_device = parallel_devices[1] if _TORCH_GREATER_EQUAL_2_0 else torch.device("cpu") + expected_device = parallel_devices[1] strategy = DDPStrategy( parallel_devices=parallel_devices, precision_plugin=precision_plugin, cluster_environment=LightningEnvironment() diff --git a/tests/tests_pytorch/strategies/test_ddp_integration.py b/tests/tests_pytorch/strategies/test_ddp_integration.py index 0b841cde8de67..17135a98fc089 100644 --- a/tests/tests_pytorch/strategies/test_ddp_integration.py +++ b/tests/tests_pytorch/strategies/test_ddp_integration.py @@ -20,7 +20,6 @@ import torch from lightning.fabric.plugins.environments import ClusterEnvironment, LightningEnvironment from lightning.fabric.utilities.distributed import _distributed_is_initialized -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.pytorch import Trainer from lightning.pytorch.callbacks import Callback, EarlyStopping from lightning.pytorch.demos.boring_classes import BoringDataModule, BoringModel @@ -112,9 +111,7 @@ class CustomCallback(Callback): def on_train_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: assert isinstance(trainer.strategy.model, DistributedDataParallel) expected = ["something"] - assert ( - trainer.strategy.model.parameters_to_ignore == set(expected) if _TORCH_GREATER_EQUAL_2_0 else expected - ) + assert trainer.strategy.model.parameters_to_ignore == set(expected) assert trainer.strategy.model.module._ddp_params_and_buffers_to_ignore == expected model = CustomModel() diff --git a/tests/tests_pytorch/strategies/test_fsdp.py b/tests/tests_pytorch/strategies/test_fsdp.py index 413a5e6c9dddd..e27aec2e7989a 100644 --- a/tests/tests_pytorch/strategies/test_fsdp.py +++ b/tests/tests_pytorch/strategies/test_fsdp.py @@ -14,11 +14,7 @@ import torch.nn as nn from lightning.fabric.plugins.environments import LightningEnvironment from lightning.fabric.strategies.fsdp import _is_sharded_checkpoint -from lightning.fabric.utilities.imports import ( - _TORCH_GREATER_EQUAL_2_0, - _TORCH_GREATER_EQUAL_2_1, - _TORCH_GREATER_EQUAL_2_2, -) +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_1, _TORCH_GREATER_EQUAL_2_2 from lightning.fabric.utilities.load import _load_distributed_checkpoint from lightning.pytorch import Trainer from lightning.pytorch.callbacks import ModelCheckpoint @@ -29,16 +25,11 @@ from lightning.pytorch.trainer.states import TrainerFn from lightning.pytorch.utilities.consolidate_checkpoint import 
_format_checkpoint from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload, FullyShardedDataParallel, MixedPrecision -from torch.distributed.fsdp.wrap import always_wrap_policy, size_based_auto_wrap_policy, wrap +from torch.distributed.fsdp.wrap import ModuleWrapPolicy, always_wrap_policy, size_based_auto_wrap_policy, wrap from torchmetrics import Accuracy from tests_pytorch.helpers.runif import RunIf -if _TORCH_GREATER_EQUAL_2_0: - from torch.distributed.fsdp.wrap import ModuleWrapPolicy -else: - ModuleWrapPolicy = object - class TestFSDPModel(BoringModel): def __init__(self): @@ -87,10 +78,10 @@ def _assert_layer_fsdp_instance(self) -> None: assert isinstance(self.trainer.strategy.precision_plugin, FSDPPrecision) if self.trainer.precision == "16-mixed": - param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32 + param_dtype = torch.float32 reduce_dtype = buffer_dtype = torch.float16 elif self.trainer.precision == "bf16-mixed": - param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32 + param_dtype = torch.float32 reduce_dtype = buffer_dtype = torch.bfloat16 elif self.trainer.precision == "16-true": param_dtype = reduce_dtype = buffer_dtype = torch.float16 @@ -119,10 +110,8 @@ def __init__(self, wrap_min_params: int = 2): self.should_be_wrapped = [wrap_min_params < (32 * 32 + 32), None, wrap_min_params < (32 * 2 + 2)] def configure_optimizers(self): - parameters = self.parameters() if _TORCH_GREATER_EQUAL_2_0 else self.trainer.model.parameters() - # SGD's FSDP optimier state is fixed in https://github.com/pytorch/pytorch/pull/99214 - return torch.optim.AdamW(parameters, lr=0.1) + return torch.optim.AdamW(self.parameters(), lr=0.1) class TestFSDPModelAutoWrapped(TestBoringModel): @@ -150,10 +139,10 @@ def _assert_layer_fsdp_instance(self) -> None: assert isinstance(self.trainer.strategy.precision_plugin, FSDPPrecision) if self.trainer.precision == "16-mixed": - param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32 + param_dtype = torch.float32 reduce_dtype = buffer_dtype = torch.float16 elif self.trainer.precision == "bf16-mixed": - param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32 + param_dtype = torch.float32 reduce_dtype = buffer_dtype = torch.bfloat16 elif self.trainer.precision == "16-true": param_dtype = reduce_dtype = buffer_dtype = torch.float16 @@ -287,23 +276,12 @@ def test_fsdp_strategy_checkpoint(state_dict_type, precision, tmp_path): _run_multiple_stages(trainer, model, os.path.join(tmp_path, "last.ckpt")) -if _TORCH_GREATER_EQUAL_2_0: - - def custom_auto_wrap_policy( - module, - recurse, - nonwrapped_numel: int, - ) -> bool: - return nonwrapped_numel >= 2 - -else: - - def custom_auto_wrap_policy( - module, - recurse, - unwrapped_params: int, - ) -> bool: - return unwrapped_params >= 2 +def custom_auto_wrap_policy( + module, + recurse, + nonwrapped_numel: int, +) -> bool: + return nonwrapped_numel >= 2 @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True) @@ -350,14 +328,6 @@ def test_fsdp_strategy_full_state_dict(tmp_path, wrap_min_params): TestFSDPModelAutoWrapped(), FSDPStrategy, {"auto_wrap_policy": custom_auto_wrap_policy}, - marks=RunIf(max_torch="2.0.0"), - id="autowrap_1x", - ), - pytest.param( - TestFSDPModelAutoWrapped(), - FSDPStrategy, - {"auto_wrap_policy": custom_auto_wrap_policy}, - marks=RunIf(min_torch="2.0.0"), id="autowrap_2x", ), pytest.param( @@ -400,7 +370,7 @@ def test_fsdp_checkpoint_multi_gpus(tmp_path, model, strategy, strategy_cfg): 
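With torch >= 2.0 the auto-wrap callable always receives nonwrapped_numel, so only one signature of the helper is kept above. A usage sketch (illustrative; the policy name mirrors the test helper):

import torch.nn as nn
from lightning.pytorch.strategies import FSDPStrategy

def custom_auto_wrap_policy(module: nn.Module, recurse: bool, nonwrapped_numel: int) -> bool:
    # wrap every submodule that still holds at least 2 unwrapped parameters
    return nonwrapped_numel >= 2

strategy = FSDPStrategy(auto_wrap_policy=custom_auto_wrap_policy)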
@pytest.mark.parametrize("use_orig_params", [None, False, True]) def test_invalid_parameters_in_optimizer(use_orig_params): fsdp_kwargs = {} - if _TORCH_GREATER_EQUAL_2_0 and use_orig_params is not None: + if use_orig_params is not None: fsdp_kwargs = {"use_orig_params": use_orig_params} trainer = Trainer( @@ -412,7 +382,7 @@ def test_invalid_parameters_in_optimizer(use_orig_params): error_context = ( nullcontext() - if _TORCH_GREATER_EQUAL_2_0 and (_TORCH_GREATER_EQUAL_2_1 or use_orig_params is not False) + if _TORCH_GREATER_EQUAL_2_1 or use_orig_params is not False else pytest.raises(ValueError, match="The optimizer does not seem to reference any FSDP parameters") ) @@ -431,7 +401,7 @@ def configure_optimizers(self): error_context = ( nullcontext() - if _TORCH_GREATER_EQUAL_2_0 and use_orig_params is not False + if use_orig_params is not False else pytest.raises(ValueError, match="The optimizer does not seem to reference any FSDP parameters") ) @@ -530,7 +500,6 @@ def test_fsdp_sharding_strategy(): assert strategy.sharding_strategy == ShardingStrategy.NO_SHARD -@RunIf(min_torch="2.0") @pytest.mark.parametrize("sharding_strategy", ["HYBRID_SHARD", "_HYBRID_SHARD_ZERO2"]) def test_fsdp_hybrid_sharding_strategy(sharding_strategy): """Test that the hybrid sharding strategies can only be used with automatic wrapping or a manually specified pg.""" @@ -555,16 +524,11 @@ def test_fsdp_hybrid_sharding_strategy(sharding_strategy): def test_fsdp_use_orig_params(): - """Test that Lightning enables `use_orig_params` in PyTorch >= 2.0.""" - with mock.patch("lightning.pytorch.strategies.fsdp._TORCH_GREATER_EQUAL_2_0", False): - strategy = FSDPStrategy() - assert "use_orig_params" not in strategy.kwargs - - with mock.patch("lightning.pytorch.strategies.fsdp._TORCH_GREATER_EQUAL_2_0", True): - strategy = FSDPStrategy() - assert strategy.kwargs["use_orig_params"] - strategy = FSDPStrategy(use_orig_params=False) - assert not strategy.kwargs["use_orig_params"] + """Test that Lightning enables `use_orig_params` automatically.""" + strategy = FSDPStrategy() + assert strategy.kwargs["use_orig_params"] + strategy = FSDPStrategy(use_orig_params=False) + assert not strategy.kwargs["use_orig_params"] @mock.patch("torch.distributed.init_process_group") @@ -583,7 +547,6 @@ def test_set_timeout(init_process_group_mock): ) -@RunIf(min_torch="2.0") @mock.patch("lightning.pytorch.strategies.fsdp._load_raw_module_state") def test_fsdp_strategy_load_optimizer_states_multiple(_, tmp_path): strategy = FSDPStrategy(parallel_devices=[torch.device("cpu")], state_dict_type="full") @@ -640,12 +603,9 @@ def test_fsdp_strategy_save_optimizer_states(tmp_path, wrap_min_params): if trainer.global_rank != 0: assert len(model_state_dict) == 0 - if trainer.global_rank != 0 and _TORCH_GREATER_EQUAL_2_1 or not _TORCH_GREATER_EQUAL_2_0: + if trainer.global_rank != 0 and _TORCH_GREATER_EQUAL_2_1: assert len(optimizer_state_dict) == 0 - if not _TORCH_GREATER_EQUAL_2_0: - return - # restore model to ddp model = TestBoringModel() trainer = Trainer(default_root_dir=tmp_path, accelerator="gpu", devices=2, strategy="ddp", max_epochs=1) @@ -714,10 +674,10 @@ def test_fsdp_strategy_load_optimizer_states(tmp_path, wrap_min_params): if trainer.global_rank != 0: assert len(restored_model_state_dict) == 0 - if trainer.global_rank != 0 and _TORCH_GREATER_EQUAL_2_1 or not _TORCH_GREATER_EQUAL_2_0: + if trainer.global_rank != 0 and _TORCH_GREATER_EQUAL_2_1: assert len(restored_optimizer_state_dict) == 0 - if trainer.global_rank == 0 and 
_TORCH_GREATER_EQUAL_2_0: + if trainer.global_rank == 0: # assert everything is the same assert len(model_state_dict) == len(restored_model_state_dict) assert len(optimizer_state_dict) == len(restored_optimizer_state_dict) @@ -766,33 +726,6 @@ def on_fit_start(self): trainer.fit(model) -@mock.patch("lightning.pytorch.strategies.fsdp._TORCH_GREATER_EQUAL_2_0", False) -@mock.patch("lightning.pytorch.strategies.fsdp.torch.load") -@mock.patch("lightning.pytorch.strategies.fsdp._load_raw_module_state") -def test_load_save_optimizer_torch_lt_2_0(_, __, tmp_path): - strategy = FSDPStrategy(state_dict_type="full") - with pytest.warns(UserWarning, match="does not support saving the optimizer state"): - strategy.optimizer_state(Mock()) - - file = tmp_path / "test.ckpt" - file.touch() - trainer = Trainer() - trainer.state.fn = TrainerFn.FITTING - strategy._lightning_module = Mock(trainer=trainer) - with pytest.warns(UserWarning, match="does not support loading the optimizer state"): - strategy.load_checkpoint(file) - - -@mock.patch("lightning.pytorch.strategies.fsdp._TORCH_GREATER_EQUAL_2_0", False) -def test_sharded_state_dict_type_support(): - """Test that the sharded state dict type is supported.""" - with pytest.raises( - NotImplementedError, - match=escape("`FSDPStrategy(state_dict_type='sharded')` is not supported in PyTorch < 2.0"), - ): - FSDPStrategy(state_dict_type="sharded") - - def test_save_checkpoint_storage_options(tmp_path): """Test that the FSDP strategy does not accept storage options for saving checkpoints.""" strategy = FSDPStrategy() @@ -800,7 +733,6 @@ def test_save_checkpoint_storage_options(tmp_path): strategy.save_checkpoint(filepath=tmp_path, checkpoint=Mock(), storage_options=Mock()) -@RunIf(min_torch="2.0.0") @mock.patch("lightning.pytorch.strategies.fsdp.FSDPStrategy.broadcast", lambda _, x: x) @mock.patch("lightning.pytorch.strategies.fsdp._get_full_state_dict_context") @mock.patch("lightning.pytorch.strategies.fsdp._get_sharded_state_dict_context") @@ -899,7 +831,7 @@ def on_train_start(self): torch.testing.assert_close(p0, p1, atol=0, rtol=0, equal_nan=True) -@RunIf(min_cuda_gpus=2, standalone=True, min_torch="2.0.0") +@RunIf(min_cuda_gpus=2, standalone=True) def test_save_load_sharded_state_dict(tmp_path): """Test FSDP saving and loading with the sharded state dict format.""" strategy = FSDPStrategy(auto_wrap_policy={nn.Linear}, state_dict_type="sharded") @@ -955,10 +887,7 @@ def test_fsdp_lazy_load_full_state_dict(_, lazy_load_mock, torch_load_mock, tmp_ file.touch() strategy.load_checkpoint(checkpoint_path=file) - if _TORCH_GREATER_EQUAL_2_0: - lazy_load_mock.assert_called_once() - else: - torch_load_mock.assert_called_once() + lazy_load_mock.assert_called_once() @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True) diff --git a/tests/tests_pytorch/trainer/optimization/test_manual_optimization.py b/tests/tests_pytorch/trainer/optimization/test_manual_optimization.py index 6a9123f2980a6..449a1d72ed3a0 100644 --- a/tests/tests_pytorch/trainer/optimization/test_manual_optimization.py +++ b/tests/tests_pytorch/trainer/optimization/test_manual_optimization.py @@ -22,7 +22,6 @@ import torch.distributed as torch_distrib import torch.nn.functional as F from lightning.fabric.utilities.exceptions import MisconfigurationException -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.pytorch import Trainer, seed_everything from lightning.pytorch.demos.boring_classes import BoringModel, ManualOptimBoringModel from 
lightning.pytorch.strategies import Strategy @@ -31,11 +30,7 @@ def assert_emtpy_grad(grad): - if _TORCH_GREATER_EQUAL_2_0: - assert grad is None - else: - if grad is not None: # backward has been called - assert torch.all(grad == 0) + assert grad is None class ManualOptModel(BoringModel): diff --git a/tests/tests_pytorch/trainer/test_trainer.py b/tests/tests_pytorch/trainer/test_trainer.py index 565971e1554b1..c4af0d37453ee 100644 --- a/tests/tests_pytorch/trainer/test_trainer.py +++ b/tests/tests_pytorch/trainer/test_trainer.py @@ -24,8 +24,6 @@ from unittest.mock import ANY, Mock, call, patch import cloudpickle -import lightning.fabric -import lightning.pytorch import pytest import torch import torch.nn as nn @@ -51,7 +49,6 @@ from lightning.pytorch.trainer.states import RunningStage, TrainerFn from lightning.pytorch.utilities.exceptions import MisconfigurationException from lightning.pytorch.utilities.imports import _OMEGACONF_AVAILABLE -from lightning.pytorch.utilities.warnings import PossibleUserWarning from torch.multiprocessing import ProcessRaisedException from torch.nn.parallel.distributed import DistributedDataParallel from torch.optim import SGD @@ -2080,12 +2077,6 @@ def test_init_module_context(monkeypatch): strategy.tensor_init_context.assert_called_once_with(empty_init=None) strategy.tensor_init_context.reset_mock() - # Pretend we are using PyTorch < 2.0 - monkeypatch.setattr(lightning.pytorch.trainer.trainer, "_TORCH_GREATER_EQUAL_2_0", False) - with pytest.warns(PossibleUserWarning, match="can't place .* on the device"), trainer.init_module(): - pass - strategy.tensor_init_context.assert_called_once() - def test_expand_home_trainer(): """Test that the dirpath gets expanded if it contains `~`.""" diff --git a/tests/tests_pytorch/utilities/test_compile.py b/tests/tests_pytorch/utilities/test_compile.py index 42daba6e05a36..9da6c390e5da1 100644 --- a/tests/tests_pytorch/utilities/test_compile.py +++ b/tests/tests_pytorch/utilities/test_compile.py @@ -25,6 +25,8 @@ from tests_pytorch.conftest import mock_cuda_count from tests_pytorch.helpers.runif import RunIf +_PYTHON_GREATER_EQUAL_3_9_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 9) + # https://github.com/pytorch/pytorch/issues/95708 @pytest.mark.skipif(sys.platform == "darwin", reason="fatal error: 'omp.h' file not found") @@ -115,6 +117,7 @@ def has_dynamo(fn): # https://github.com/pytorch/pytorch/issues/95708 @pytest.mark.skipif(sys.platform == "darwin", reason="fatal error: 'omp.h' file not found") +@pytest.mark.skipif(not _PYTHON_GREATER_EQUAL_3_9_0, reason="AssertionError: failed to reach fixed point") @pytest.mark.xfail( sys.platform == "win32" and _TORCH_GREATER_EQUAL_2_2, strict=False, reason="RuntimeError: Failed to import" ) @@ -144,6 +147,7 @@ def training_step(self, batch, batch_idx): # https://github.com/pytorch/pytorch/issues/95708 @pytest.mark.skipif(sys.platform == "darwin", reason="fatal error: 'omp.h' file not found") +@pytest.mark.skipif(not _PYTHON_GREATER_EQUAL_3_9_0, reason="AssertionError: failed to reach fixed point") @pytest.mark.xfail( sys.platform == "win32" and _TORCH_GREATER_EQUAL_2_2, strict=False, reason="RuntimeError: Failed to import" ) diff --git a/tests/tests_pytorch/utilities/test_model_summary.py b/tests/tests_pytorch/utilities/test_model_summary.py index 290dfb67faf7d..a50ec425fc894 100644 --- a/tests/tests_pytorch/utilities/test_model_summary.py +++ b/tests/tests_pytorch/utilities/test_model_summary.py @@ -17,7 +17,6 @@ import pytest import torch import torch.nn as nn 
-from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.pytorch import LightningModule, Trainer from lightning.pytorch.demos.boring_classes import BoringModel from lightning.pytorch.utilities.model_summary.model_summary import ( @@ -294,10 +293,6 @@ def __init__(self): def forward(self, *args, **kwargs): return self.layer(*args, **kwargs) - if isinstance(example_input, dict) and not _TORCH_GREATER_EQUAL_2_0: - # kwargs are not supported when torch < 2.0 - expected_size = UNKNOWN_SIZE - model = DummyLightningModule() model.example_input_array = example_input summary = summarize(model, max_depth=max_depth)
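
Note (not part of the patch): the test changes above drop the 1.13 fallbacks and call the PyTorch 2.0-style FSDP wrapping APIs directly. Below is a minimal, illustrative Python sketch of the usage those tests now assume; it only uses names that already appear in the diff (`custom_auto_wrap_policy` with `nonwrapped_numel`, `ModuleWrapPolicy`, `FSDPStrategy(use_orig_params=..., state_dict_type=...)`) and is a sketch of the tested behavior, not a definitive reference.

```python
import torch.nn as nn
from torch.distributed.fsdp.wrap import ModuleWrapPolicy  # available in torch >= 2.0

from lightning.pytorch.strategies import FSDPStrategy


def custom_auto_wrap_policy(module: nn.Module, recurse: bool, nonwrapped_numel: int) -> bool:
    # torch >= 2.0 passes `nonwrapped_numel`; the 1.13-era `unwrapped_params` spelling is gone,
    # which is why the duplicated policy definition could be removed from test_fsdp.py.
    return nonwrapped_numel >= 2


# Callable policy, as exercised by the "autowrap_2x" case in test_fsdp_checkpoint_multi_gpus.
strategy = FSDPStrategy(auto_wrap_policy=custom_auto_wrap_policy)
# Enabled by default on torch >= 2.0, see test_fsdp_use_orig_params.
assert strategy.kwargs["use_orig_params"]

# Class-based policy plus the sharded checkpoint format, as in test_save_load_sharded_state_dict.
strategy = FSDPStrategy(auto_wrap_policy=ModuleWrapPolicy({nn.Linear}), state_dict_type="sharded")
```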