Commit ef97a6c
Replace PositiveDefinite link with CholeskyFactor
This makes the MVN score sampling test stable for the jax backend, where the keras.ops.cholesky operation is numerically unstable. To resolve the issue, the score's sample method no longer calls keras.ops.cholesky; instead, the estimation head now returns the Cholesky factor directly rather than the covariance matrix (as it used to).
1 parent 82e28a7 commit ef97a6c
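
The identity the fix relies on: if z ~ N(0, I) and L is a Cholesky factor of the covariance Sigma (i.e. Sigma = L L^T), then mean + L z ~ N(mean, Sigma). Sampling therefore only ever needs the factor L, so predicting L directly removes any need to factorize Sigma at sampling time. A minimal numpy sketch of this identity (illustrative shapes only, not the bayesflow API):

import numpy as np

rng = np.random.default_rng(0)
dim, n = 3, 200_000
mean = np.array([1.0, -2.0, 0.5])
A = rng.normal(size=(dim, dim))
cov = A @ A.T + dim * np.eye(dim)   # a well-conditioned SPD covariance
L = np.linalg.cholesky(cov)         # the factor a head would predict directly

z = rng.standard_normal((n, dim))   # independent standard normal samples
x = mean + z @ L.T                  # equivalent to mean + L z for each sample

# the empirical covariance of the transformed samples recovers cov
assert np.allclose(np.cov(x, rowvar=False), cov, atol=0.1)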

5 files changed: 48 additions, 49 deletions

bayesflow/links/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 
 from .ordered import Ordered
 from .ordered_quantiles import OrderedQuantiles
-from .positive_definite import PositiveDefinite
+from .cholesky_factor import CholeskyFactor
 
 from ..utils._docs import _add_imports_to_all
 
bayesflow/links/positive_definite.py renamed to bayesflow/links/cholesky_factor.py

Lines changed: 3 additions & 8 deletions
@@ -6,8 +6,8 @@
 
 
 @serializable("bayesflow.links")
-class PositiveDefinite(keras.Layer):
-    """Activation function to link from flat elements of a lower triangular matrix to a positive definite matrix."""
+class CholeskyFactor(keras.Layer):
+    """Activation function to link from a flat tensor to a lower triangular matrix with positive diagonal."""
 
     def __init__(self, **kwargs):
         super().__init__(**layer_kwargs(kwargs))
@@ -17,12 +17,7 @@ def call(self, inputs: Tensor) -> Tensor:
         L = fill_triangular_matrix(inputs)
         L = positive_diag(L)
 
-        # calculate positive definite matrix from cholesky factors:
-        psd = keras.ops.matmul(
-            L,
-            keras.ops.swapaxes(L, -2, -1),  # L transposed
-        )
-        return psd
+        return L
 
     def compute_output_shape(self, input_shape):
         m = input_shape[-1]
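
For orientation, here is a self-contained numpy sketch of what the link now computes. The packing order of bayesflow's fill_triangular_matrix and the bijection applied by positive_diag are assumptions here (row-wise filling and exp, respectively); the actual utilities may differ in detail.

import numpy as np

def cholesky_factor_link(flat: np.ndarray, dim: int) -> np.ndarray:
    """Map dim * (dim + 1) // 2 unconstrained values to a lower triangular
    matrix with strictly positive diagonal, i.e. a valid Cholesky factor."""
    L = np.zeros((dim, dim))
    L[np.tril_indices(dim)] = flat   # fill the lower triangle (assumed row-wise)
    d = np.diag_indices(dim)
    L[d] = np.exp(L[d])              # enforce a positive diagonal (assumed exp)
    return L

flat = np.random.default_rng(42).normal(size=3 * 4 // 2)  # dim = 3 needs 6 values
L = cholesky_factor_link(flat, dim=3)
cov = L @ L.T   # by construction symmetric positive definite

Because any lower triangular matrix with strictly positive diagonal is a valid Cholesky factor, L @ L.T is always symmetric positive definite, which is exactly what the old PositiveDefinite link used to return.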

bayesflow/scores/multivariate_normal_score.py

Lines changed: 27 additions & 22 deletions
@@ -3,7 +3,7 @@
 import keras
 
 from bayesflow.types import Shape, Tensor
-from bayesflow.links import PositiveDefinite
+from bayesflow.links import CholeskyFactor
 from bayesflow.utils.serialization import serializable
 
 from .parametric_distribution_score import ParametricDistributionScore
@@ -13,26 +13,27 @@
 class MultivariateNormalScore(ParametricDistributionScore):
     r""":math:`S(\hat p_{\mu, \Sigma}, \theta; k) = -\log( \mathcal N (\theta; \mu, \Sigma))`
 
-    Scores a predicted mean and covariance matrix with the log-score of the probability of the materialized value.
+    Scores a predicted mean and (Cholesky factor of the) covariance matrix with the log-score of the probability
+    of the materialized value.
     """
 
-    NOT_TRANSFORMING_LIKE_VECTOR_WARNING = ("covariance",)
+    NOT_TRANSFORMING_LIKE_VECTOR_WARNING = ("cov_chol",)
     """
-    Marks head for covariance matrix as an exception for adapter transformations.
+    Marks head for covariance matrix Cholesky factor as an exception for adapter transformations.
 
     This variable contains names of prediction heads that should lead to a warning when the adapter is applied
     in inverse direction to them.
 
     For more information see :py:class:`ScoringRule`.
     """
 
-    TRANSFORMATION_TYPE: dict[str, str] = {"covariance": "both_sides_scale"}
+    TRANSFORMATION_TYPE: dict[str, str] = {"cov_chol": "left_side_scale"}
     """
-    Marks covariance head to handle de-standardization as for covariant rank-(0,2) tensors.
+    Marks covariance Cholesky factor head to handle de-standardization as for covariant rank-(0,2) tensors.
 
     The appropriate inverse of the standardization operation is
 
-    x_ij = x_ij' * sigma_i * sigma_j.
+    x_ij = sigma_i * x_ij'.
 
     For the mean head the default ("location_scale") is not overridden.
     """
@@ -41,7 +42,7 @@ def __init__(self, dim: int = None, links: dict = None, **kwargs):
         super().__init__(links=links, **kwargs)
 
         self.dim = dim
-        self.links = links or {"covariance": PositiveDefinite()}
+        self.links = links or {"cov_chol": CholeskyFactor()}
 
         self.config = {"dim": dim}
 
@@ -51,14 +52,14 @@ def get_config(self):
 
     def get_head_shapes_from_target_shape(self, target_shape: Shape) -> dict[str, Shape]:
         self.dim = target_shape[-1]
-        return dict(mean=(self.dim,), covariance=(self.dim, self.dim))
+        return dict(mean=(self.dim,), cov_chol=(self.dim, self.dim))
 
-    def log_prob(self, x: Tensor, mean: Tensor, covariance: Tensor) -> Tensor:
+    def log_prob(self, x: Tensor, mean: Tensor, cov_chol: Tensor) -> Tensor:
         """
         Compute the log probability density of a multivariate Gaussian distribution.
 
         This function calculates the log probability density for each sample in `x` under a
-        multivariate Gaussian distribution with the given `mean` and `covariance`.
+        multivariate Gaussian distribution with the given `mean` and `cov_chol`.
 
         The computation includes the determinant of the covariance matrix, its inverse, and the quadratic
         form in the exponential term of the Gaussian density function.
@@ -80,6 +81,12 @@ def log_prob(self, x: Tensor, mean: Tensor, covariance: Tensor) -> Tensor:
             given Gaussian distribution.
         """
         diff = x - mean
+
+        # Calculate covariance from Cholesky factors
+        covariance = keras.ops.matmul(
+            cov_chol,
+            keras.ops.swapaxes(cov_chol, -2, -1),
+        )
         precision = keras.ops.inv(covariance)
         log_det_covariance = keras.ops.slogdet(covariance)[1]  # Only take the log of the determinant part
 
@@ -91,14 +98,12 @@ def log_prob(self, x: Tensor, mean: Tensor, covariance: Tensor) -> Tensor:
 
         return log_prob
 
-    def sample(self, batch_shape: Shape, mean: Tensor, covariance: Tensor) -> Tensor:
+    def sample(self, batch_shape: Shape, mean: Tensor, cov_chol: Tensor) -> Tensor:
         """
         Generate samples from a multivariate Gaussian distribution.
 
-        This function samples from a multivariate Gaussian distribution with the given `mean`
-        and `covariance` using the Cholesky decomposition method. Independent standard normal
-        samples are transformed using the Cholesky factor of the covariance matrix to generate
-        correlated samples.
+        Independent standard normal samples are transformed using the Cholesky factor of the covariance matrix
+        to generate correlated samples.
 
         Parameters
         ----------
@@ -107,8 +112,8 @@ def sample(self, batch_shape: Shape, mean: Tensor, covariance: Tensor) -> Tensor
         mean : Tensor
             A tensor representing the mean of the multivariate Gaussian distribution.
             Must have shape (batch_size, D), where D is the dimensionality of the distribution.
-        covariance : Tensor
-            A tensor representing the covariance matrix of the multivariate Gaussian distribution.
+        cov_chol : Tensor
+            A tensor representing a Cholesky factor of the covariance matrix of the multivariate Gaussian distribution.
             Must have shape (batch_size, D, D), where D is the dimensionality.
 
         Returns
@@ -123,16 +128,16 @@ def sample(self, batch_shape: Shape, mean: Tensor, covariance: Tensor) -> Tensor
         if keras.ops.shape(mean) != (batch_size, dim):
             raise ValueError(f"mean must have shape (batch_size, {dim}), but got {keras.ops.shape(mean)}")
 
-        if keras.ops.shape(covariance) != (batch_size, dim, dim):
+        if keras.ops.shape(cov_chol) != (batch_size, dim, dim):
             raise ValueError(
-                f"covariance must have shape (batch_size, {dim}, {dim}), but got {keras.ops.shape(covariance)}"
+                f"covariance Cholesky factor must have shape (batch_size, {dim}, {dim}),"
+                f"but got {keras.ops.shape(cov_chol)}"
             )
 
         # Use Cholesky decomposition to generate samples
-        cholesky_factor = keras.ops.cholesky(covariance)
         normal_samples = keras.random.normal((*batch_shape, dim))
 
-        scaled_normal = keras.ops.einsum("ijk,ilk->ilj", cholesky_factor, normal_samples)
+        scaled_normal = keras.ops.einsum("ijk,ilk->ilj", cov_chol, normal_samples)
         samples = mean[:, None, :] + scaled_normal
 
         return samples
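
The switch from "both_sides_scale" to "left_side_scale" follows from how the factor transforms under de-standardization: if Sigma' = S Sigma S for a diagonal scale matrix S = diag(sigma), then S L is a Cholesky factor of Sigma', so the factor only needs to be multiplied from the left. A hedged numpy check of this equivalence (not bayesflow code):

import numpy as np

rng = np.random.default_rng(1)
dim = 4
A = rng.normal(size=(dim, dim))
L = np.linalg.cholesky(A @ A.T + dim * np.eye(dim))  # some valid Cholesky factor
S = np.diag(rng.uniform(0.5, 2.0, size=dim))         # per-dimension scales sigma_i

cov_from_scaled_factor = (S @ L) @ (S @ L).T  # left_side_scale applied to the factor
cov_scaled_directly = S @ (L @ L.T) @ S       # both_sides_scale applied to the covariance
assert np.allclose(cov_from_scaled_factor, cov_scaled_directly)

Note that S @ L stays lower triangular with a positive diagonal, so the de-standardized head output remains a valid Cholesky factor.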

tests/test_links/conftest.py

Lines changed: 4 additions & 4 deletions
@@ -33,18 +33,18 @@ def ordered_quantiles():
 
 
 @pytest.fixture()
-def positive_definite():
-    from bayesflow.links import PositiveDefinite
+def cholesky_factor():
+    from bayesflow.links import CholeskyFactor
 
-    return PositiveDefinite()
+    return CholeskyFactor()
 
 
 @pytest.fixture()
 def linear():
     return keras.layers.Activation("linear")
 
 
-@pytest.fixture(params=["ordered", "ordered_quantiles", "positive_definite", "linear"], scope="function")
+@pytest.fixture(params=["ordered", "ordered_quantiles", "cholesky_factor", "linear"], scope="function")
 def link(request):
     return request.getfixturevalue(request.param)
 

tests/test_links/test_links.py

Lines changed: 13 additions & 14 deletions
@@ -52,21 +52,20 @@ def test_quantile_ordering(quantiles, unordered):
     check_ordering(output, axis)
 
 
-def test_positive_definite(positive_definite, batch_size, num_variables):
-    input_shape = positive_definite.compute_input_shape((batch_size, num_variables, num_variables))
+def test_cholesky_factor(cholesky_factor, batch_size, num_variables):
+    input_shape = cholesky_factor.compute_input_shape((batch_size, num_variables, num_variables))
 
-    # Too strongly negative values lead to numerical instabilities -> reduce scale
-    random_preactivation = keras.random.normal(input_shape) * 0.1
-    output = positive_definite(random_preactivation)
-    output = keras.ops.convert_to_numpy(output)
-
-    # Check if output is invertible
-    np.linalg.inv(output)
+    random_preactivation = keras.random.normal(input_shape)
 
-    # Calculated eigenvalues to test for positive definiteness
-    eigenvalues = np.linalg.eig(output).eigenvalues
+    output = cholesky_factor(random_preactivation)
+    output = keras.ops.convert_to_numpy(output)
 
-    assert np.all(eigenvalues.real > 0) and np.all(np.isclose(eigenvalues.imag, 0)), (
-        f"output is not positive definite: min(real)={np.min(eigenvalues.real)}, "
-        f"max(abs(imag))={np.max(np.abs(eigenvalues.imag))}"
+    np.testing.assert_allclose(
+        np.triu(output, k=1),
+        np.zeros((batch_size, num_variables, num_variables)),
+        atol=1e-4,
+        err_msg=f"All elements above diagonal must be zero for lower triangular matrix: {output}",
     )
+
+    diag = np.diagonal(output, axis1=1, axis2=2)
+    assert np.all(diag > 0), f"diagonal is not strictly positive: {diag}"
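
The rewritten test checks lower triangularity and a strictly positive diagonal instead of eigenvalues. This is sufficient because for any lower triangular L with positive diagonal, L L^T is symmetric positive definite, which is exactly the property the old eigenvalue-based test asserted. A small numpy sketch of that implication (illustrative only):

import numpy as np

rng = np.random.default_rng(2)
L = np.tril(rng.normal(size=(4, 4)))  # lower triangular
d = np.diag_indices(4)
L[d] = np.abs(L[d]) + 0.1             # strictly positive diagonal

# L @ L.T is symmetric with all eigenvalues positive, i.e. positive definite
eigenvalues = np.linalg.eigvalsh(L @ L.T)
assert np.all(eigenvalues > 0)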
