Fix/552 ekfac bug mps osx #578

Merged 6 commits on May 7, 2024
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -14,6 +14,10 @@
  implementation [PR #570](https://github.com/aai-institute/pyDVL/pull/570)
- Missing move to device of `preconditioner` in `CgInfluence` implementation
  [PR #572](https://github.com/aai-institute/pyDVL/pull/572)
- Raise a more specific error when a `RuntimeError` occurs in
  `torch.linalg.eigh`, so the user can check whether it is related to a known
  issue [PR #578](https://github.com/aai-institute/pyDVL/pull/578)

## 0.9.1 - Bug fixes, logging improvement

5 changes: 3 additions & 2 deletions src/pydvl/influence/torch/influence_function_model.py
@@ -39,6 +39,7 @@
    EkfacRepresentation,
    empirical_cross_entropy_loss_fn,
    flatten_dimensions,
    safe_torch_linalg_eigh,
)

logger = logging.getLogger(__name__)
@@ -1227,8 +1228,8 @@ def fit(self, data: DataLoader) -> EkfacInfluence:
        layers_evect_g = {}
        layers_diags = {}
        for key in self.active_layers.keys():
            evals_a, evecs_a = torch.linalg.eigh(forward_x[key])
            evals_g, evecs_g = torch.linalg.eigh(grad_y[key])
            evals_a, evecs_a = safe_torch_linalg_eigh(forward_x[key])
            evals_g, evecs_g = safe_torch_linalg_eigh(grad_y[key])
            layers_evecs_a[key] = evecs_a
            layers_evect_g[key] = evecs_g
            layers_diags[key] = torch.kron(evals_g.view(-1, 1), evals_a.view(-1, 1))
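For context on the hunk above: the two `eigh` calls decompose per-layer second-moment matrices whose size grows with layer width, which is exactly the regime where the upstream PyTorch issue bites. A minimal self-contained sketch of that step; the shapes and random stand-ins for `forward_x[key]` and `grad_y[key]` are assumptions for illustration, not pyDVL internals:

```python
import torch

# Illustrative stand-ins for the per-layer statistics accumulated during
# EkfacInfluence.fit: a (d_in x d_in) second moment of layer inputs and a
# (d_out x d_out) second moment of output gradients, faked from random data.
d_in, d_out = 64, 32
x = torch.randn(128, d_in)
g = torch.randn(128, d_out)
forward_x = x.t() @ x / x.shape[0]  # symmetric PSD, valid input for eigh
grad_y = g.t() @ g / g.shape[0]

evals_a, evecs_a = torch.linalg.eigh(forward_x)
evals_g, evecs_g = torch.linalg.eigh(grad_y)

# The EKFAC diagonal is the Kronecker product of the eigenvalue vectors,
# yielding one entry per weight of a (d_out x d_in) layer.
diag = torch.kron(evals_g.view(-1, 1), evals_a.view(-1, 1))
assert diag.shape == (d_in * d_out, 1)
```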
47 changes: 47 additions & 0 deletions src/pydvl/influence/torch/util.py
@@ -22,6 +22,7 @@
from numpy.typing import NDArray
from torch.utils.data import Dataset

from ...utils.exceptions import catch_and_raise_exception
from ..array import NestedSequenceAggregator, NumpyConverter, SequenceAggregator

logger = logging.getLogger(__name__)
@@ -521,3 +522,49 @@ def empirical_cross_entropy_loss_fn(
        torch.isfinite(log_probs_), log_probs_, torch.zeros_like(log_probs_)
    )
    return torch.sum(log_probs_ * probs_.detach() ** 0.5)


@catch_and_raise_exception(RuntimeError, lambda e: TorchLinalgEighException(e))
def safe_torch_linalg_eigh(*args, **kwargs):
    """
    A wrapper around `torch.linalg.eigh` that safely handles potential runtime
    errors by raising a custom `TorchLinalgEighException` with more context,
    especially related to the issue reported in
    [https://github.com/pytorch/pytorch/issues/92141](
    https://github.com/pytorch/pytorch/issues/92141).

    Args:
        *args: Positional arguments passed to `torch.linalg.eigh`.
        **kwargs: Keyword arguments passed to `torch.linalg.eigh`.

    Returns:
        The result of calling `torch.linalg.eigh` with the provided arguments.

    Raises:
        TorchLinalgEighException: If a `RuntimeError` occurs during the
            execution of `torch.linalg.eigh`.
    """
    return torch.linalg.eigh(*args, **kwargs)


class TorchLinalgEighException(Exception):
    """
    Exception wrapping a `RuntimeError` raised by `torch.linalg.eigh` when it
    is used with large matrices; see
    [https://github.com/pytorch/pytorch/issues/92141](
    https://github.com/pytorch/pytorch/issues/92141)
    """

    def __init__(self, original_exception: RuntimeError):
        func = torch.linalg.eigh
        err_msg = (
            f"A RuntimeError occurred in '{func.__module__}.{func.__qualname__}'. "
            "This might be related to known issues with "
            "[torch.linalg.eigh][torch.linalg.eigh] on certain matrix sizes.\n"
            "For more details, refer to "
            "https://github.com/pytorch/pytorch/issues/92141.\n"
            "In this case, consider using a different implementation that does "
            "not depend on [torch.linalg.eigh][torch.linalg.eigh].\n"
            f"Inspect the original exception message:\n{str(original_exception)}"
        )
        super().__init__(err_msg)
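For illustration, a small usage sketch of the new wrapper; the matrix size needed to actually trigger the failure is platform dependent per the linked issue, and this snippet is not part of the PR:

```python
import torch

from pydvl.influence.torch.util import (
    TorchLinalgEighException,
    safe_torch_linalg_eigh,
)

t = torch.randn(10, 10)
t = t @ t.t()  # make it symmetric so eigh applies

try:
    evals, evecs = safe_torch_linalg_eigh(t)
except TorchLinalgEighException as e:
    # On affected setups (e.g. macOS arm64 with very large matrices), the
    # underlying RuntimeError is re-raised with guidance attached.
    print(e)
```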
59 changes: 59 additions & 0 deletions src/pydvl/utils/exceptions.py
@@ -0,0 +1,59 @@
from functools import wraps
from typing import Callable, Type, TypeVar

CatchExceptionType = TypeVar("CatchExceptionType", bound=BaseException)
RaiseExceptionType = TypeVar("RaiseExceptionType", bound=BaseException)


def catch_and_raise_exception(
    catch_exception_type: Type[CatchExceptionType],
    raise_exception_factory: Callable[[CatchExceptionType], RaiseExceptionType],
) -> Callable:
    """
    A decorator that catches exceptions of a specified exception type and
    raises another specified exception instead.

    Args:
        catch_exception_type: The type of the exception to catch.
        raise_exception_factory: A factory function that creates a new
            exception from the caught one.

    Returns:
        A decorator function that wraps the target function.

    ??? Example

        ```python
        @catch_and_raise_exception(RuntimeError, lambda e: TorchLinalgEighException(e))
        def safe_torch_linalg_eigh(*args, **kwargs):
            '''
            A wrapper around `torch.linalg.eigh` that safely handles potential
            runtime errors by raising a custom `TorchLinalgEighException` with
            more context, especially related to the issue reported in
            https://github.com/pytorch/pytorch/issues/92141.

            Args:
                *args: Positional arguments passed to `torch.linalg.eigh`.
                **kwargs: Keyword arguments passed to `torch.linalg.eigh`.

            Returns:
                The result of calling `torch.linalg.eigh` with the provided
                arguments.

            Raises:
                TorchLinalgEighException: If a `RuntimeError` occurs during
                    the execution of `torch.linalg.eigh`.
            '''
            return torch.linalg.eigh(*args, **kwargs)
        ```
    """

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except catch_exception_type as e:
                raise raise_exception_factory(e) from e

        return wrapper

    return decorator
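To see the decorator in isolation, here is a minimal, self-contained sketch; `DataLoadError` and `load_json` are hypothetical names invented for this example:

```python
import json

from pydvl.utils.exceptions import catch_and_raise_exception


class DataLoadError(Exception):
    """Hypothetical domain-level wrapper for low-level decoding errors."""

    def __init__(self, original: Exception):
        super().__init__(f"Failed to load data: {original}")


@catch_and_raise_exception(json.JSONDecodeError, lambda e: DataLoadError(e))
def load_json(text: str):
    return json.loads(text)


# load_json('{"a": 1}') returns {'a': 1}, while load_json("not json")
# raises DataLoadError, chained from the original JSONDecodeError.
```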
5 changes: 5 additions & 0 deletions tests/conftest.py
@@ -1,5 +1,6 @@
import logging
import os
import platform
from dataclasses import asdict
from typing import TYPE_CHECKING, Optional, Tuple

@@ -264,3 +265,7 @@ def pytest_terminal_summary(
):
    tolerate_session = terminalreporter.config._tolerate_session
    tolerate_session.display(terminalreporter)


def is_osx_arm64():
    return platform.system() == "Darwin" and platform.machine() == "arm64"
21 changes: 21 additions & 0 deletions tests/influence/torch/test_util.py
@@ -17,11 +17,14 @@
    lanzcos_low_rank_hessian_approx,
)
from pydvl.influence.torch.util import (
    TorchLinalgEighException,
    TorchTensorContainerType,
    align_structure,
    flatten_dimensions,
    safe_torch_linalg_eigh,
    torch_dataset_to_dask_array,
)
from tests.conftest import is_osx_arm64
from tests.influence.conftest import linear_hessian_analytical, linear_model


@@ -297,3 +300,21 @@ def are_active_layers_linear(model):
        if any(param_requires_grad):
            return False
    return True


@pytest.mark.torch
def test_safe_torch_linalg_eigh():
    t = torch.randn([10, 10])
    t = t @ t.t()
    safe_eigs, safe_eigvec = safe_torch_linalg_eigh(t)
    eigs, eigvec = torch.linalg.eigh(t)
    assert torch.allclose(safe_eigs, eigs)
    assert torch.allclose(safe_eigvec, eigvec)


@pytest.mark.torch
@pytest.mark.slow
@pytest.mark.skipif(not is_osx_arm64(), reason="Requires macOS ARM64.")
def test_safe_torch_linalg_eigh_exception():
    with pytest.raises(TorchLinalgEighException):
        safe_torch_linalg_eigh(torch.randn([53000, 53000]))
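Relatedly, for users who hit the exception this test exercises, the error message suggests an implementation not based on `torch.linalg.eigh`. One possible fallback, sketched under the assumption that a CPU/NumPy round trip is acceptable (not part of this PR):

```python
import numpy as np
import torch


def eigh_via_numpy(t: torch.Tensor):
    # Run the eigendecomposition through NumPy/LAPACK on CPU, then move the
    # results back to the tensor's original device and dtype.
    evals, evecs = np.linalg.eigh(t.detach().cpu().numpy())
    to = dict(device=t.device, dtype=t.dtype)
    return torch.from_numpy(evals).to(**to), torch.from_numpy(evecs).to(**to)
```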