fix(callbacks/lightning): populate callback for lightning (#114)
XuehaiPan authored Dec 17, 2023
1 parent b50b837 commit bff355b
Showing 9 changed files with 203 additions and 17 deletions.
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
@@ -25,16 +25,16 @@ repos:
       - id: debug-statements
       - id: double-quote-string-fixer
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.7
+    rev: v0.1.8
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]
   - repo: https://github.com/PyCQA/isort
-    rev: 5.13.0
+    rev: 5.13.2
     hooks:
       - id: isort
   - repo: https://github.com/psf/black
-    rev: 23.11.0
+    rev: 23.12.0
     hooks:
       - id: black
   - repo: https://github.com/asottile/pyupgrade
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ### Added

+- Add separate implementation for `GpuStatsLogger` callback for `lightning` by [@XuehaiPan](https://github.com/XuehaiPan) in [#114](https://github.com/XuehaiPan/nvitop/pull/114).
 - Remove metrics if process is gone in `nvitop-exporter` by [@XuehaiPan](https://github.com/XuehaiPan) in [#107](https://github.com/XuehaiPan/nvitop/pull/107).

 ### Changed
6 changes: 3 additions & 3 deletions README.md
@@ -577,11 +577,11 @@ model.fit(.., callbacks=[gpu_stats, tb_callback])

 **NOTE:** Users should assign a `keras.callbacks.TensorBoard` callback or a `keras.callbacks.CSVLogger` callback to the model. And the `GpuStatsLogger` callback should be placed before the `keras.callbacks.TensorBoard` / `keras.callbacks.CSVLogger` callback.

-#### Callback for [PyTorch Lightning](https://pytorchlightning.ai)
+#### Callback for [PyTorch Lightning](https://lightning.ai)

 ```python
-from pytorch_lightning import Trainer
-from nvitop.callbacks.pytorch_lightning import GpuStatsLogger
+from lightning.pytorch import Trainer
+from nvitop.callbacks.lightning import GpuStatsLogger
 gpu_stats = GpuStatsLogger()
 trainer = Trainer(gpus=[..], logger=True, callbacks=[gpu_stats])
 ```
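The README snippet above keeps the device list elided (`[..]`). For context, here is a fuller, self-contained sketch — not part of this commit — of where the callback plugs into a Lightning 2.x run. Note that `Trainer(gpus=...)` was removed in Lightning 2.0, so the sketch selects devices with `accelerator`/`devices` instead; `BoringModel` and the synthetic dataset are illustrative stand-ins, not nvitop APIs, and running it requires a CUDA GPU with the NVIDIA driver installed (the callback raises otherwise).

```python
import torch
from torch.utils.data import DataLoader, TensorDataset
from lightning.pytorch import LightningModule, Trainer

from nvitop.callbacks.lightning import GpuStatsLogger


class BoringModel(LightningModule):
    """Minimal stand-in LightningModule; just enough to drive the hooks."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 1)

    def training_step(self, batch, batch_idx):
        (x,) = batch
        return self.layer(x).sum()  # scalar "loss" so backward() works

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


if __name__ == '__main__':
    dataset = TensorDataset(torch.randn(64, 32))
    # `gpus=` was removed in Lightning 2.x; select devices explicitly instead.
    trainer = Trainer(
        accelerator='gpu',
        devices=1,
        max_epochs=1,
        logger=True,  # GpuStatsLogger requires the Trainer to have a logger
        callbacks=[GpuStatsLogger(intra_step_time=True, inter_step_time=True)],
    )
    trainer.fit(BoringModel(), DataLoader(dataset, batch_size=8))
```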
1 change: 1 addition & 0 deletions docs/requirements.txt
@@ -4,5 +4,6 @@ sphinx-autobuild
 sphinx-copybutton
 sphinx-rtd-theme

+lightning >= 2.0.0, < 3.0.0a0
 pytorch-lightning >= 1.5.0, < 2.0.0a0
 tensorflow-cpu >= 2.0.0, < 2.12.0a0
8 changes: 8 additions & 0 deletions docs/source/callbacks.rst
@@ -12,6 +12,14 @@ nvitop.callbacks.keras module
    :undoc-members:
    :show-inheritance:

+nvitop.callbacks.lightning module
+---------------------------------
+
+.. automodule:: nvitop.callbacks.lightning
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 nvitop.callbacks.pytorch\_lightning module
 ------------------------------------------
1 change: 1 addition & 0 deletions docs/source/spelling_wordlist.txt
@@ -153,3 +153,4 @@ ThroughputInfo
 pytorch
 api
 utils
+GpuStatsLogger
7 changes: 3 additions & 4 deletions nvitop/callbacks/keras.py
@@ -23,15 +23,14 @@
 import re
 import time

-from tensorflow.python.keras.callbacks import (  # pylint: disable=import-error,no-name-in-module
-    Callback,
-)
+# pylint: disable-next=import-error,no-name-in-module
+from tensorflow.python.keras.callbacks import Callback

 from nvitop.api import libnvml
 from nvitop.callbacks.utils import get_devices_by_logical_ids, get_gpu_stats


-# Ported version of .pytorch_lightning.GpuStatsLogger for Keras
+# Ported version of nvitop.callbacks.lightning.GpuStatsLogger for Keras
 class GpuStatsLogger(Callback):  # pylint: disable=too-many-instance-attributes
     """Automatically log GPU stats during training stage. :class:`GpuStatsLogger` is a callback and
     in order to use it you need to assign a TensorBoard callback or a CSVLogger callback to the model.
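For reference, a minimal sketch of the Keras-side usage this module supports — not part of this diff — assuming a GPU-enabled TensorFlow install and that the callback takes the list of logical GPU devices the model runs on, as in the README's Keras example; the model and data here are illustrative. Per the README note above, `GpuStatsLogger` must precede the `TensorBoard`/`CSVLogger` callback so the stats it adds get written out.

```python
import numpy as np
import tensorflow as tf

from nvitop.callbacks.keras import GpuStatsLogger

x, y = np.random.rand(64, 32), np.random.rand(64, 1)
model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(32,))])
model.compile(optimizer='sgd', loss='mse')

gpus = ['gpu:0']  # logical devices the model runs on (illustrative)
gpu_stats = GpuStatsLogger(gpus)  # queries NVML for these GPUs
tb_callback = tf.keras.callbacks.TensorBoard(log_dir='./logs')

# Order matters: GpuStatsLogger must come before TensorBoard / CSVLogger so
# the GPU stats it injects into `logs` are flushed by the logging callback.
model.fit(x, y, epochs=1, callbacks=[gpu_stats, tb_callback])
```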
170 changes: 167 additions & 3 deletions nvitop/callbacks/lightning.py
@@ -15,7 +15,171 @@
 # limitations under the License.
 # ==============================================================================

-# pylint: disable=missing-module-docstring
+# pylint: disable=missing-module-docstring,missing-function-docstring
+# pylint: disable=unused-argument,attribute-defined-outside-init

-# pylint: disable-next=unused-import
-from nvitop.callbacks.pytorch_lightning import GpuStatsLogger  # noqa: F401
+from __future__ import annotations
+
+import time
+from typing import Any
+
+import lightning.pytorch as pl  # pylint: disable=import-error
+from lightning.pytorch.callbacks import Callback  # pylint: disable=import-error
+from lightning.pytorch.utilities import rank_zero_only  # pylint: disable=import-error
+from lightning.pytorch.utilities.exceptions import (  # pylint: disable=import-error
+    MisconfigurationException,
+)
+
+from nvitop.api import libnvml
+from nvitop.callbacks.utils import get_devices_by_logical_ids, get_gpu_stats
+
+
+# Modified from pytorch_lightning.callbacks.GPUStatsMonitor
+class GpuStatsLogger(Callback):  # pylint: disable=too-many-instance-attributes
+    """Automatically log GPU stats during the training stage. :class:`GpuStatsLogger` is a
+    callback and in order to use it you need to assign a logger in the ``Trainer``.
+
+    Args:
+        memory_utilization (bool):
+            Set to :data:`True` to log used, free and the percentage of memory utilization at the
+            start and end of each step. Default: :data:`True`.
+        gpu_utilization (bool):
+            Set to :data:`True` to log the percentage of GPU utilization at the start and end of
+            each step. Default: :data:`True`.
+        intra_step_time (bool):
+            Set to :data:`True` to log the time of each step. Default: :data:`False`.
+        inter_step_time (bool):
+            Set to :data:`True` to log the time between the end of one step and the start of the
+            next step. Default: :data:`False`.
+        fan_speed (bool):
+            Set to :data:`True` to log the fan speed as a percentage. Default: :data:`False`.
+        temperature (bool):
+            Set to :data:`True` to log the GPU temperature in degrees Celsius.
+            Default: :data:`False`.
+
+    Raises:
+        MisconfigurationException:
+            If NVIDIA driver is not installed, not running on GPUs, or ``Trainer`` has no logger.
+
+    Examples:
+        >>> from lightning.pytorch import Trainer
+        >>> from nvitop.callbacks.lightning import GpuStatsLogger
+        >>> gpu_stats = GpuStatsLogger()
+        >>> trainer = Trainer(accelerator='gpu', devices=[..], logger=True, callbacks=[gpu_stats])
+
+    GPU stats are mainly based on NVML queries. The description of the queries is as follows:
+
+    - **fan.speed** - The fan speed value is the percent of maximum speed that the device's fan is
+      currently intended to run at. It ranges from 0 to 100%. Note: The reported speed is the
+      intended fan speed. If the fan is physically blocked and unable to spin, this output will not
+      match the actual fan speed. Many parts do not report fan speeds because they rely on cooling
+      via fans in the surrounding enclosure.
+    - **memory.used** - Total memory allocated by active contexts, in MiBs.
+    - **memory.free** - Total free memory, in MiBs.
+    - **utilization.gpu** - Percent of time over the past sample period during which one or more
+      kernels were executing on the GPU. The sample period may be between 1 second and 1/6 second
+      depending on the product.
+    - **utilization.memory** - Percent of time over the past sample period during which global
+      (device) memory was being read or written. The sample period may be between 1 second and 1/6
+      second depending on the product.
+    - **temperature** - Core GPU temperature, in degrees C.
+    """
+
+    def __init__(  # pylint: disable=too-many-arguments
+        self,
+        memory_utilization: bool = True,
+        gpu_utilization: bool = True,
+        intra_step_time: bool = False,
+        inter_step_time: bool = False,
+        fan_speed: bool = False,
+        temperature: bool = False,
+    ) -> None:
+        super().__init__()
+
+        try:
+            libnvml.nvmlInit()
+        except libnvml.NVMLError as ex:
+            raise MisconfigurationException(
+                'Cannot use GpuStatsLogger callback because NVIDIA driver is not installed.',
+            ) from ex
+
+        self._memory_utilization = memory_utilization
+        self._gpu_utilization = gpu_utilization
+        self._intra_step_time = intra_step_time
+        self._inter_step_time = inter_step_time
+        self._fan_speed = fan_speed
+        self._temperature = temperature
+
+    def on_train_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
+        if not trainer.logger:
+            raise MisconfigurationException(
+                'Cannot use GpuStatsLogger callback with Trainer that has no logger.',
+            )
+
+        if trainer.strategy.root_device.type != 'cuda':
+            raise MisconfigurationException(
+                f'You are using GpuStatsLogger but are not running on GPU. '
+                f'The root device type is {trainer.strategy.root_device.type}.',
+            )
+
+        device_ids = trainer.device_ids
+
+        try:
+            self._devices = get_devices_by_logical_ids(device_ids, unique=True)
+        except (libnvml.NVMLError, RuntimeError) as ex:
+            raise ValueError(
+                f'Cannot use GpuStatsLogger callback because devices unavailable. '
+                f'Received: `gpus={device_ids}`',
+            ) from ex
+
+    def on_train_epoch_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
+        self._snap_intra_step_time = None
+        self._snap_inter_step_time = None
+
+    @rank_zero_only
+    def on_train_batch_start(  # pylint: disable=arguments-differ
+        self,
+        trainer: pl.Trainer,
+        pl_module: pl.LightningModule,
+        **kwargs: Any,
+    ) -> None:
+        if self._intra_step_time:
+            self._snap_intra_step_time = time.monotonic()
+
+        logs = self._get_gpu_stats()
+
+        if self._inter_step_time and self._snap_inter_step_time:
+            # First log at beginning of second step
+            logs['batch_time/inter_step (ms)'] = 1000.0 * (
+                time.monotonic() - self._snap_inter_step_time
+            )
+
+        trainer.logger.log_metrics(logs, step=trainer.global_step)
+
+    @rank_zero_only
+    def on_train_batch_end(  # pylint: disable=arguments-differ
+        self,
+        trainer: pl.Trainer,
+        pl_module: pl.LightningModule,
+        **kwargs: Any,
+    ) -> None:
+        if self._inter_step_time:
+            self._snap_inter_step_time = time.monotonic()
+
+        logs = self._get_gpu_stats()
+
+        if self._intra_step_time and self._snap_intra_step_time:
+            logs['batch_time/intra_step (ms)'] = 1000.0 * (
+                time.monotonic() - self._snap_intra_step_time
+            )
+
+        trainer.logger.log_metrics(logs, step=trainer.global_step)
+
+    def _get_gpu_stats(self) -> dict[str, float]:
+        """Get the GPU status from NVML queries."""
+        return get_gpu_stats(
+            devices=self._devices,
+            memory_utilization=self._memory_utilization,
+            gpu_utilization=self._gpu_utilization,
+            fan_speed=self._fan_speed,
+            temperature=self._temperature,
+        )
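The intra-/inter-step timing in `on_train_batch_start`/`on_train_batch_end` hinges on two `time.monotonic()` snapshots whose roles alternate at batch boundaries: intra-step spans start-to-end of one batch, inter-step spans end-to-start of the next. A framework-independent sketch of that same pattern — the `StepTimer` name and the driving loop are hypothetical, while the metric keys mirror the ones above:

```python
from __future__ import annotations

import time


class StepTimer:
    """Standalone model of the callback's two-snapshot timing scheme."""

    def __init__(self) -> None:
        self._snap_intra: float | None = None  # set at batch start
        self._snap_inter: float | None = None  # set at batch end

    def on_batch_start(self) -> dict[str, float]:
        logs: dict[str, float] = {}
        if self._snap_inter is not None:
            # Inter-step time is first logged at the beginning of the *second* step.
            logs['batch_time/inter_step (ms)'] = 1000.0 * (time.monotonic() - self._snap_inter)
        self._snap_intra = time.monotonic()
        return logs

    def on_batch_end(self) -> dict[str, float]:
        logs: dict[str, float] = {}
        if self._snap_intra is not None:
            logs['batch_time/intra_step (ms)'] = 1000.0 * (time.monotonic() - self._snap_intra)
        self._snap_inter = time.monotonic()
        return logs


timer = StepTimer()
for step in range(3):
    timer.on_batch_start()
    time.sleep(0.01)  # simulate the forward/backward work of one step
    print(step, timer.on_batch_end())
```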
20 changes: 16 additions & 4 deletions nvitop/callbacks/pytorch_lightning.py
@@ -21,7 +21,9 @@
 from __future__ import annotations

 import time
+from typing import Any

+import pytorch_lightning as pl  # pylint: disable=import-error
 from pytorch_lightning.callbacks import Callback  # pylint: disable=import-error
 from pytorch_lightning.utilities import rank_zero_only  # pylint: disable=import-error
 from pytorch_lightning.utilities.exceptions import (  # pylint: disable=import-error
@@ -107,7 +109,7 @@ def __init__(  # pylint: disable=too-many-arguments
         self._fan_speed = fan_speed
         self._temperature = temperature

-    def on_train_start(self, trainer, pl_module) -> None:
+    def on_train_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
         if not trainer.logger:
             raise MisconfigurationException(
                 'Cannot use GpuStatsLogger callback with Trainer that has no logger.',
@@ -132,12 +134,17 @@ def on_train_start(self, trainer, pl_module) -> None:
                 f'Received: `gpus={device_ids}`',
             ) from ex

-    def on_train_epoch_start(self, trainer, pl_module) -> None:
+    def on_train_epoch_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
         self._snap_intra_step_time = None
         self._snap_inter_step_time = None

     @rank_zero_only
-    def on_train_batch_start(self, trainer, **kwargs) -> None:  # pylint: disable=arguments-differ
+    def on_train_batch_start(  # pylint: disable=arguments-differ
+        self,
+        trainer: pl.Trainer,
+        pl_module: pl.LightningModule,
+        **kwargs: Any,
+    ) -> None:
         if self._intra_step_time:
             self._snap_intra_step_time = time.monotonic()

@@ -152,7 +159,12 @@ def on_train_batch_start(self, trainer, **kwargs) -> None:  # pylint: disable=arguments-differ
         trainer.logger.log_metrics(logs, step=trainer.global_step)

     @rank_zero_only
-    def on_train_batch_end(self, trainer, **kwargs) -> None:  # pylint: disable=arguments-differ
+    def on_train_batch_end(  # pylint: disable=arguments-differ
+        self,
+        trainer: pl.Trainer,
+        pl_module: pl.LightningModule,
+        **kwargs: Any,
+    ) -> None:
         if self._inter_step_time:
             self._snap_inter_step_time = time.monotonic()
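For projects still on `pytorch-lightning < 2.0`, this module remains the import path. A hedged sketch of that legacy usage, mirroring the README lines this commit replaced (`model` and the dataloader are user-supplied and not shown):

```python
from pytorch_lightning import Trainer

from nvitop.callbacks.pytorch_lightning import GpuStatsLogger

# `gpus=` is still a valid Trainer argument in pytorch-lightning 1.x.
gpu_stats = GpuStatsLogger(memory_utilization=True, gpu_utilization=True)
trainer = Trainer(gpus=1, logger=True, callbacks=[gpu_stats])
# trainer.fit(model, train_dataloaders=train_loader)  # user-supplied model/loader
```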
