fix(callbacks/lightning): populate callback for lightning (#114)
XuehaiPan authored Dec 17, 2023
1 parent b50b837 commit bff355b
Showing 9 changed files with 203 additions and 17 deletions.
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
@@ -25,16 +25,16 @@ repos:
       - id: debug-statements
       - id: double-quote-string-fixer
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.7
+    rev: v0.1.8
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]
   - repo: https://github.com/PyCQA/isort
-    rev: 5.13.0
+    rev: 5.13.2
     hooks:
       - id: isort
   - repo: https://github.com/psf/black
-    rev: 23.11.0
+    rev: 23.12.0
     hooks:
       - id: black
   - repo: https://github.com/asottile/pyupgrade
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ### Added

+- Add separate implementation for `GpuStatsLogger` callback for `lightning` by [@XuehaiPan](https://github.com/XuehaiPan) in [#114](https://github.com/XuehaiPan/nvitop/pull/114).
 - Remove metrics if process is gone in `nvitop-exporter` by [@XuehaiPan](https://github.com/XuehaiPan) in [#107](https://github.com/XuehaiPan/nvitop/pull/107).

 ### Changed
6 changes: 3 additions & 3 deletions README.md
@@ -577,11 +577,11 @@ model.fit(.., callbacks=[gpu_stats, tb_callback])

 **NOTE:** Users should assign a `keras.callbacks.TensorBoard` callback or a `keras.callbacks.CSVLogger` callback to the model. And the `GpuStatsLogger` callback should be placed before the `keras.callbacks.TensorBoard` / `keras.callbacks.CSVLogger` callback.

-#### Callback for [PyTorch Lightning](https://pytorchlightning.ai)
+#### Callback for [PyTorch Lightning](https://lightning.ai)

 ```python
-from pytorch_lightning import Trainer
-from nvitop.callbacks.pytorch_lightning import GpuStatsLogger
+from lightning.pytorch import Trainer
+from nvitop.callbacks.lightning import GpuStatsLogger
 gpu_stats = GpuStatsLogger()
 trainer = Trainer(gpus=[..], logger=True, callbacks=[gpu_stats])
 ```
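The README snippet above keeps the device list elided (`[..]`). For context, here is a fuller, self-contained sketch — not part of this commit — of where the callback plugs into a Lightning 2.x run. Note that `Trainer(gpus=...)` was removed in Lightning 2.0, so the sketch selects devices with `accelerator`/`devices` instead; `BoringModel` and the synthetic dataset are illustrative stand-ins, not nvitop APIs, and running it requires a CUDA GPU with the NVIDIA driver installed (the callback raises otherwise).

```python
import torch
from torch.utils.data import DataLoader, TensorDataset
from lightning.pytorch import LightningModule, Trainer

from nvitop.callbacks.lightning import GpuStatsLogger


class BoringModel(LightningModule):
    """Minimal stand-in LightningModule; just enough to drive the hooks."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 1)

    def training_step(self, batch, batch_idx):
        (x,) = batch
        return self.layer(x).sum()  # scalar "loss" so backward() works

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


if __name__ == '__main__':
    dataset = TensorDataset(torch.randn(64, 32))
    # `gpus=` was removed in Lightning 2.x; select devices explicitly instead.
    trainer = Trainer(
        accelerator='gpu',
        devices=1,
        max_epochs=1,
        logger=True,  # GpuStatsLogger requires the Trainer to have a logger
        callbacks=[GpuStatsLogger(intra_step_time=True, inter_step_time=True)],
    )
    trainer.fit(BoringModel(), DataLoader(dataset, batch_size=8))
```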
1 change: 1 addition & 0 deletions docs/requirements.txt
@@ -4,5 +4,6 @@ sphinx-autobuild
 sphinx-copybutton
 sphinx-rtd-theme

+lightning >= 2.0.0, < 3.0.0a0
 pytorch-lightning >= 1.5.0, < 2.0.0a0
 tensorflow-cpu >= 2.0.0, < 2.12.0a0
8 changes: 8 additions & 0 deletions docs/source/callbacks.rst
@@ -12,6 +12,14 @@ nvitop.callbacks.keras module
    :undoc-members:
    :show-inheritance:

+nvitop.callbacks.lightning module
+---------------------------------
+
+.. automodule:: nvitop.callbacks.lightning
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 nvitop.callbacks.pytorch\_lightning module
 ------------------------------------------
1 change: 1 addition & 0 deletions docs/source/spelling_wordlist.txt
@@ -153,3 +153,4 @@ ThroughputInfo
 pytorch
 api
 utils
+GpuStatsLogger
7 changes: 3 additions & 4 deletions nvitop/callbacks/keras.py
@@ -23,15 +23,14 @@
 import re
 import time

-from tensorflow.python.keras.callbacks import (  # pylint: disable=import-error,no-name-in-module
-    Callback,
-)
+# pylint: disable-next=import-error,no-name-in-module
+from tensorflow.python.keras.callbacks import Callback

 from nvitop.api import libnvml
 from nvitop.callbacks.utils import get_devices_by_logical_ids, get_gpu_stats


-# Ported version of .pytorch_lightning.GpuStatsLogger for Keras
+# Ported version of nvitop.callbacks.lightning.GpuStatsLogger for Keras
 class GpuStatsLogger(Callback):  # pylint: disable=too-many-instance-attributes
     """Automatically log GPU stats during training stage. :class:`GpuStatsLogger` is a callback and
     in order to use it you need to assign a TensorBoard callback or a CSVLogger callback to the model.
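For reference, a minimal sketch of the Keras-side usage this module supports — not part of this diff — assuming a GPU-enabled TensorFlow install and that the callback takes the list of logical GPU devices the model runs on, as in the README's Keras example; the model and data here are illustrative. Per the README note above, `GpuStatsLogger` must precede the `TensorBoard`/`CSVLogger` callback so the stats it adds get written out.

```python
import numpy as np
import tensorflow as tf

from nvitop.callbacks.keras import GpuStatsLogger

x, y = np.random.rand(64, 32), np.random.rand(64, 1)
model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(32,))])
model.compile(optimizer='sgd', loss='mse')

gpus = ['gpu:0']  # logical devices the model runs on (illustrative)
gpu_stats = GpuStatsLogger(gpus)  # queries NVML for these GPUs
tb_callback = tf.keras.callbacks.TensorBoard(log_dir='./logs')

# Order matters: GpuStatsLogger must come before TensorBoard / CSVLogger so
# the GPU stats it injects into `logs` are flushed by the logging callback.
model.fit(x, y, epochs=1, callbacks=[gpu_stats, tb_callback])
```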
170 changes: 167 additions & 3 deletions nvitop/callbacks/lightning.py
@@ -15,7 +15,171 @@
 # limitations under the License.
 # ==============================================================================

-# pylint: disable=missing-module-docstring
+# pylint: disable=missing-module-docstring,missing-function-docstring
+# pylint: disable=unused-argument,attribute-defined-outside-init

-# pylint: disable-next=unused-import
-from nvitop.callbacks.pytorch_lightning import GpuStatsLogger  # noqa: F401
+from __future__ import annotations
+
+import time
+from typing import Any
+
+import lightning.pytorch as pl  # pylint: disable=import-error
+from lightning.pytorch.callbacks import Callback  # pylint: disable=import-error
+from lightning.pytorch.utilities import rank_zero_only  # pylint: disable=import-error
+from lightning.pytorch.utilities.exceptions import (  # pylint: disable=import-error
+    MisconfigurationException,
+)
+
+from nvitop.api import libnvml
+from nvitop.callbacks.utils import get_devices_by_logical_ids, get_gpu_stats
+
+
+# Modified from pytorch_lightning.callbacks.GPUStatsMonitor
+class GpuStatsLogger(Callback):  # pylint: disable=too-many-instance-attributes
+    """Automatically log GPU stats during the training stage. :class:`GpuStatsLogger` is a
+    callback and in order to use it you need to assign a logger in the ``Trainer``.
+
+    Args:
+        memory_utilization (bool):
+            Set to :data:`True` to log used, free and the percentage of memory utilization at the
+            start and end of each step. Default: :data:`True`.
+        gpu_utilization (bool):
+            Set to :data:`True` to log the percentage of GPU utilization at the start and end of
+            each step. Default: :data:`True`.
+        intra_step_time (bool):
+            Set to :data:`True` to log the time of each step. Default: :data:`False`.
+        inter_step_time (bool):
+            Set to :data:`True` to log the time between the end of one step and the start of the
+            next step. Default: :data:`False`.
+        fan_speed (bool):
+            Set to :data:`True` to log the fan speed as a percentage. Default: :data:`False`.
+        temperature (bool):
+            Set to :data:`True` to log the GPU temperature in degrees Celsius.
+            Default: :data:`False`.
+
+    Raises:
+        MisconfigurationException:
+            If NVIDIA driver is not installed, not running on GPUs, or ``Trainer`` has no logger.
+
+    Examples:
+        >>> from lightning.pytorch import Trainer
+        >>> from nvitop.callbacks.lightning import GpuStatsLogger
+        >>> gpu_stats = GpuStatsLogger()
+        >>> trainer = Trainer(accelerator='gpu', devices=[..], logger=True, callbacks=[gpu_stats])
+
+    GPU stats are mainly based on NVML queries. The description of the queries is as follows:
+
+    - **fan.speed** - The fan speed value is the percent of maximum speed that the device's fan is
+      currently intended to run at. It ranges from 0 to 100%. Note: The reported speed is the
+      intended fan speed. If the fan is physically blocked and unable to spin, this output will not
+      match the actual fan speed. Many parts do not report fan speeds because they rely on cooling
+      via fans in the surrounding enclosure.
+    - **memory.used** - Total memory allocated by active contexts, in MiBs.
+    - **memory.free** - Total free memory, in MiBs.
+    - **utilization.gpu** - Percent of time over the past sample period during which one or more
+      kernels were executing on the GPU. The sample period may be between 1 second and 1/6 second
+      depending on the product.
+    - **utilization.memory** - Percent of time over the past sample period during which global
+      (device) memory was being read or written. The sample period may be between 1 second and 1/6
+      second depending on the product.
+    - **temperature** - Core GPU temperature, in degrees C.
+    """
+
+    def __init__(  # pylint: disable=too-many-arguments
+        self,
+        memory_utilization: bool = True,
+        gpu_utilization: bool = True,
+        intra_step_time: bool = False,
+        inter_step_time: bool = False,
+        fan_speed: bool = False,
+        temperature: bool = False,
+    ) -> None:
+        super().__init__()
+
+        try:
+            libnvml.nvmlInit()
+        except libnvml.NVMLError as ex:
+            raise MisconfigurationException(
+                'Cannot use GpuStatsLogger callback because NVIDIA driver is not installed.',
+            ) from ex
+
+        self._memory_utilization = memory_utilization
+        self._gpu_utilization = gpu_utilization
+        self._intra_step_time = intra_step_time
+        self._inter_step_time = inter_step_time
+        self._fan_speed = fan_speed
+        self._temperature = temperature
+
+    def on_train_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
+        if not trainer.logger:
+            raise MisconfigurationException(
+                'Cannot use GpuStatsLogger callback with Trainer that has no logger.',
+            )
+
+        if trainer.strategy.root_device.type != 'cuda':
+            raise MisconfigurationException(
+                f'You are using GpuStatsLogger but are not running on GPU. '
+                f'The root device type is {trainer.strategy.root_device.type}.',
+            )
+
+        device_ids = trainer.device_ids
+
+        try:
+            self._devices = get_devices_by_logical_ids(device_ids, unique=True)
+        except (libnvml.NVMLError, RuntimeError) as ex:
+            raise ValueError(
+                f'Cannot use GpuStatsLogger callback because devices unavailable. '
+                f'Received: `gpus={device_ids}`',
+            ) from ex
+
+    def on_train_epoch_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
+        self._snap_intra_step_time = None
+        self._snap_inter_step_time = None
+
+    @rank_zero_only
+    def on_train_batch_start(  # pylint: disable=arguments-differ
+        self,
+        trainer: pl.Trainer,
+        pl_module: pl.LightningModule,
+        **kwargs: Any,
+    ) -> None:
+        if self._intra_step_time:
+            self._snap_intra_step_time = time.monotonic()
+
+        logs = self._get_gpu_stats()
+
+        if self._inter_step_time and self._snap_inter_step_time:
+            # First log at beginning of second step
+            logs['batch_time/inter_step (ms)'] = 1000.0 * (
+                time.monotonic() - self._snap_inter_step_time
+            )
+
+        trainer.logger.log_metrics(logs, step=trainer.global_step)
+
+    @rank_zero_only
+    def on_train_batch_end(  # pylint: disable=arguments-differ
+        self,
+        trainer: pl.Trainer,
+        pl_module: pl.LightningModule,
+        **kwargs: Any,
+    ) -> None:
+        if self._inter_step_time:
+            self._snap_inter_step_time = time.monotonic()
+
+        logs = self._get_gpu_stats()
+
+        if self._intra_step_time and self._snap_intra_step_time:
+            logs['batch_time/intra_step (ms)'] = 1000.0 * (
+                time.monotonic() - self._snap_intra_step_time
+            )
+
+        trainer.logger.log_metrics(logs, step=trainer.global_step)
+
+    def _get_gpu_stats(self) -> dict[str, float]:
+        """Get the GPU status from NVML queries."""
+        return get_gpu_stats(
+            devices=self._devices,
+            memory_utilization=self._memory_utilization,
+            gpu_utilization=self._gpu_utilization,
+            fan_speed=self._fan_speed,
+            temperature=self._temperature,
+        )
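The intra-/inter-step timing in `on_train_batch_start`/`on_train_batch_end` hinges on two `time.monotonic()` snapshots whose roles alternate at batch boundaries: intra-step spans start-to-end of one batch, inter-step spans end-to-start of the next. A framework-independent sketch of that same pattern — the `StepTimer` name and the driving loop are hypothetical, while the metric keys mirror the ones above:

```python
from __future__ import annotations

import time


class StepTimer:
    """Standalone model of the callback's two-snapshot timing scheme."""

    def __init__(self) -> None:
        self._snap_intra: float | None = None  # set at batch start
        self._snap_inter: float | None = None  # set at batch end

    def on_batch_start(self) -> dict[str, float]:
        logs: dict[str, float] = {}
        if self._snap_inter is not None:
            # Inter-step time is first logged at the beginning of the *second* step.
            logs['batch_time/inter_step (ms)'] = 1000.0 * (time.monotonic() - self._snap_inter)
        self._snap_intra = time.monotonic()
        return logs

    def on_batch_end(self) -> dict[str, float]:
        logs: dict[str, float] = {}
        if self._snap_intra is not None:
            logs['batch_time/intra_step (ms)'] = 1000.0 * (time.monotonic() - self._snap_intra)
        self._snap_inter = time.monotonic()
        return logs


timer = StepTimer()
for step in range(3):
    timer.on_batch_start()
    time.sleep(0.01)  # simulate the forward/backward work of one step
    print(step, timer.on_batch_end())
```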
20 changes: 16 additions & 4 deletions nvitop/callbacks/pytorch_lightning.py
@@ -21,7 +21,9 @@
 from __future__ import annotations

 import time
+from typing import Any

+import pytorch_lightning as pl  # pylint: disable=import-error
 from pytorch_lightning.callbacks import Callback  # pylint: disable=import-error
 from pytorch_lightning.utilities import rank_zero_only  # pylint: disable=import-error
 from pytorch_lightning.utilities.exceptions import (  # pylint: disable=import-error
@@ -107,7 +109,7 @@ def __init__(  # pylint: disable=too-many-arguments
         self._fan_speed = fan_speed
         self._temperature = temperature

-    def on_train_start(self, trainer, pl_module) -> None:
+    def on_train_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
         if not trainer.logger:
             raise MisconfigurationException(
                 'Cannot use GpuStatsLogger callback with Trainer that has no logger.',
@@ -132,12 +134,17 @@ def on_train_start(self, trainer, pl_module) -> None:
                 f'Received: `gpus={device_ids}`',
             ) from ex

-    def on_train_epoch_start(self, trainer, pl_module) -> None:
+    def on_train_epoch_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
         self._snap_intra_step_time = None
         self._snap_inter_step_time = None

     @rank_zero_only
-    def on_train_batch_start(self, trainer, **kwargs) -> None:  # pylint: disable=arguments-differ
+    def on_train_batch_start(  # pylint: disable=arguments-differ
+        self,
+        trainer: pl.Trainer,
+        pl_module: pl.LightningModule,
+        **kwargs: Any,
+    ) -> None:
         if self._intra_step_time:
             self._snap_intra_step_time = time.monotonic()

@@ -152,7 +159,12 @@ def on_train_batch_start(self, trainer, **kwargs) -> None:  # pylint: disable=arguments-differ
         trainer.logger.log_metrics(logs, step=trainer.global_step)

     @rank_zero_only
-    def on_train_batch_end(self, trainer, **kwargs) -> None:  # pylint: disable=arguments-differ
+    def on_train_batch_end(  # pylint: disable=arguments-differ
+        self,
+        trainer: pl.Trainer,
+        pl_module: pl.LightningModule,
+        **kwargs: Any,
+    ) -> None:
         if self._inter_step_time:
             self._snap_inter_step_time = time.monotonic()
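For projects still on `pytorch-lightning < 2.0`, this module remains the import path. A hedged sketch of that legacy usage, mirroring the README lines this commit replaced (`model` and the dataloader are user-supplied and not shown):

```python
from pytorch_lightning import Trainer

from nvitop.callbacks.pytorch_lightning import GpuStatsLogger

# `gpus=` is still a valid Trainer argument in pytorch-lightning 1.x.
gpu_stats = GpuStatsLogger(memory_utilization=True, gpu_utilization=True)
trainer = Trainer(gpus=1, logger=True, callbacks=[gpu_stats])
# trainer.fit(model, train_dataloaders=train_loader)  # user-supplied model/loader
```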
