
Implemented credibility_interval() #188

Open · wants to merge 89 commits into master

Changes from 77 commits

Commits (89)
00c61e5
Implemented Samples.credibility_interval
Mar 22, 2022
8098bfd
Improve documentation
Mar 22, 2022
f6afe0a
Formatting / make flake8 happy
Mar 22, 2022
6c816eb
pydocstyle compliance
Mar 22, 2022
e3c1f21
Fix typo made while flake8-ing
Mar 22, 2022
c22e304
Merge branch 'williamjameshandley:master' into credibility-interval
Stefan-Heimersheim Jun 8, 2022
3f8f2cb
Merge branch 'master' into credibility-interval
williamjameshandley Jul 12, 2022
a39c038
That will teach me to try and use github to manually merge
williamjameshandley Jul 12, 2022
be0eb50
Move credibility_interval to utils
Jul 19, 2022
3b20f1c
Formatting
Jul 19, 2022
4a66d74
flake8 wants def rather than lambda
Jul 19, 2022
0d53bce
Remove unneeded variable
Jul 19, 2022
d6b8bb9
Remove imports no longer needed
Jul 19, 2022
da7ebaf
docstring format
Jul 19, 2022
92f24be
Updated CI checks to be in line with github requirements
williamjameshandley Jul 27, 2022
19b3e1a
Moving to full coverage
williamjameshandley Jul 27, 2022
e97526e
Added a link to the fastCL repo
williamjameshandley Jul 27, 2022
12a6bc8
Unified quantiles, cdfs and credibility-intervals
williamjameshandley Jul 27, 2022
bcf2ae1
Added some CDF tests
williamjameshandley Jul 28, 2022
095a64a
Merge branch 'master' into credibility-interval
williamjameshandley Aug 3, 2022
cf95f46
Merge branch 'master' into credibility-interval
williamjameshandley Aug 5, 2022
7f56e78
Merge branch 'master' into credibility-interval
lukashergt Aug 6, 2022
d9c4076
Merge branch 'master' into credibility-interval
lukashergt Aug 9, 2022
25e3843
Merge branch 'master' into credibility-interval
lukashergt Aug 10, 2022
693abef
Implemented compress_weights-based credibility_interval with uncertai…
Nov 3, 2022
f6e9b40
Merge remote-tracking branch 'upstream/master' into credibility-interval
Apr 6, 2023
7d634b5
Remove cdf() function and quantile changes
Apr 6, 2023
f33decf
Implement covariance and clean up code
Apr 6, 2023
0b938e1
Fix indexing
Apr 10, 2023
3d26a41
Update tests
Apr 10, 2023
e5cb4b6
flake8 compliance
Apr 10, 2023
30c70bf
Merge branch 'master' into credibility-interval
Apr 10, 2023
36c58d2
Merge branch 'master' into credibility-interval
lukashergt Apr 11, 2023
4f79a09
version bump from 2.0.0-beta.28 to 2.0.0-beta.29
lukashergt Apr 11, 2023
dccfe4b
fix docstring formatting of `utils.credibility_interval`
lukashergt Apr 12, 2023
84ed5b6
Add tests for other methods
Apr 26, 2023
d1aaf02
Added tests for compress_samples
Apr 26, 2023
2527895
Updated docstring
Apr 26, 2023
b466d1a
Removed u kw argument
Apr 26, 2023
0d59b7c
n_iter to nsamples
Apr 26, 2023
2a0d3b8
Added CDF fill_value; added tests
Apr 26, 2023
ed4607f
Typo
Apr 26, 2023
b4b78e6
Renamed methods to full names
Apr 26, 2023
9911dd2
flake8 compatibility
Apr 26, 2023
5ac5fca
Implemented Samples.credibility_interval
Apr 26, 2023
86d3836
flake8 compatibility
Apr 26, 2023
dbd1303
Implemented suggestions
May 15, 2023
7ce8b59
Improved tests with Lukas' suggestions
May 15, 2023
f016fa5
Improve tests
May 15, 2023
fb42c59
bump version number to 2.0.0b30
lukashergt May 15, 2023
5c14599
Merge branch 'master' into credibility-interval
lukashergt May 15, 2023
1997ffd
Update _version.py
lukashergt May 15, 2023
b0862e1
Merge branch 'master' into credibility-interval
lukashergt May 25, 2023
a92f822
Update _version.py
lukashergt May 27, 2023
4d16340
Update README.rst
lukashergt May 27, 2023
1e78eb7
Merge branch 'master' into credibility-interval
lukashergt Jun 7, 2023
7fdd99a
remove `verbose` kwarg which is now unused
lukashergt Jun 7, 2023
8a44f17
remove also docstring of `verbose` kwarg
lukashergt Jun 7, 2023
1327055
rewrite `credibility_interval` _method_ to automatically do all DataF…
lukashergt Jun 9, 2023
16a0d42
version bump to 2.0.0-beta.35
lukashergt Jun 14, 2023
49a3d1b
version bump to 2.0.0-beta.36
lukashergt Jun 14, 2023
48444c4
Merge branch 'master' into credibility-interval
lukashergt Jun 14, 2023
d1cdd8c
fix flake8: blank line contains whitespace
lukashergt Jun 14, 2023
f641a40
Merge branch 'master' into credibility-interval
lukashergt Jun 14, 2023
5fb72f1
version bump to 2.0.0b37
lukashergt Jun 14, 2023
bad654c
update from `ncompress=-1` to `ncompress='equal'`
lukashergt Jun 14, 2023
26ce56a
Merge branch 'master' into credibility-interval
lukashergt Jun 14, 2023
39337dc
update another instance of `ncompress=-1` to `ncompress='equal'`
lukashergt Jun 14, 2023
010a02e
implement `return_covariance` option for lower-limit and upper-limit …
lukashergt Jun 15, 2023
f05f3fd
update `test_credibility_interval` to new dataframe return values
lukashergt Jun 15, 2023
f1e4d11
Merge branch 'master' into credibility-interval
williamjameshandley Jun 29, 2023
707376b
Merge branch 'master' into credibility-interval
lukashergt Jul 19, 2023
463ae88
Merge branch 'master' into credibility-interval
williamjameshandley Jul 26, 2023
49863cc
newline
williamjameshandley Jul 26, 2023
1ec5b8e
move `credibility_interval` method from `samples.py` to `weighted_pan…
lukashergt Jul 27, 2023
0c63e6e
add tests for Series method of `credibility_interval`
lukashergt Jul 27, 2023
54bf848
Merge branch 'master' into credibility-interval
lukashergt Jul 27, 2023
292aba5
Merge branch 'master' into credibility-interval
lukashergt Jul 31, 2023
b85b84d
Merge branch 'master' into credibility-interval
williamjameshandley Aug 1, 2023
525381c
Merge branch 'master' into credibility-interval
williamjameshandley Aug 4, 2023
9fabe8d
Remove unnecessary assertion
Aug 4, 2023
3da3115
Merge branch 'master' into credibility-interval
lukashergt Aug 15, 2023
56a0c6c
version bump to 2.3.0
lukashergt Aug 15, 2023
8aa2bb8
Merge branch 'master' into credibility-interval
lukashergt Aug 16, 2023
5832781
Merge branch 'master' into credibility-interval
lukashergt Aug 16, 2023
48b1a8b
version bump to 2.4.0
lukashergt Aug 16, 2023
202d754
Merge branch 'master' into credibility-interval
lukashergt Sep 30, 2023
fe25aeb
version bump to 2.5.0
lukashergt Sep 30, 2023
3e6bd4b
Merge branch 'master' into credibility-interval
williamjameshandley May 15, 2024
2 changes: 1 addition & 1 deletion README.rst
@@ -2,7 +2,7 @@
anesthetic: nested sampling post-processing
===========================================
:Authors: Will Handley and Lukas Hergt
:Version: 2.1.2
:Version: 2.2.0
:Homepage: https://github.com/handley-lab/anesthetic
:Documentation: http://anesthetic.readthedocs.io/

2 changes: 1 addition & 1 deletion anesthetic/_version.py
@@ -1 +1 @@
__version__ = '2.1.2'
__version__ = '2.2.0'
127 changes: 127 additions & 0 deletions anesthetic/utils.py
@@ -3,6 +3,7 @@
import pandas
from scipy import special
from scipy.interpolate import interp1d
from scipy.optimize import minimize_scalar
from scipy.stats import kstwobign, entropy
from matplotlib.tri import Triangulation
import contextlib
@@ -151,6 +152,132 @@ def quantile(a, q, w=None, interpolation='linear'):
return quant


def sample_cdf(samples, inverse=False, interpolation='linear'):
"""Sample the empirical cdf for a 1d array."""
samples = np.sort(samples)
ngaps = len(samples)-1
gaps = np.random.dirichlet(np.ones(ngaps))
cdf = np.array([0, *np.cumsum(gaps)])
assert np.isclose(cdf[-1], 1, atol=1e-9, rtol=1e-9), \
"Error: CDF does not reach 1 but "+str(cdf[-1])
# Set the last element (tested to be approx 1)
# to exactly 1 to avoid interpolation errors
cdf[-1] = 1
[Review comment — Collaborator]
Would it be more appropriate to normalise as follows?

    cdf /= cdf[-1]

Or would it be better to handle bound errors within interp1d? (see below)

[Reply — Stefan-Heimersheim (Author), Apr 26, 2023]
Honestly it doesn't matter: with atol=1e-9 this is just dealing with Python float precision. So let's stick to the simpler version, to avoid looking like we're doing some actual normalisation.

[Review comment — Collaborator]
I would strongly prefer there not to be an assert statement (which the normalisation would fix). This kind of thing would be infuriating as part of a large automated pipeline, where floating-point errors derail a larger workflow.
if inverse:
return interp1d(cdf, samples, kind=interpolation)
else:
return interp1d(samples, cdf, kind=interpolation,
fill_value=(0, 1), bounds_error=False)
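As a standalone sketch of the Dirichlet-based CDF sampling above (hypothetical helper name, and using the `cdf /= cdf[-1]` normalisation suggested in the review thread rather than the assert), assuming only NumPy and SciPy:

```python
import numpy as np
from scipy.interpolate import interp1d


def sample_cdf_sketch(samples, inverse=False, rng=None):
    """Draw one random realisation of the empirical CDF of `samples`
    by assigning Dirichlet-distributed gaps between the sorted points."""
    rng = np.random.default_rng(rng)
    samples = np.sort(np.asarray(samples, dtype=float))
    gaps = rng.dirichlet(np.ones(len(samples) - 1))
    cdf = np.concatenate([[0.0], np.cumsum(gaps)])
    cdf /= cdf[-1]  # absorb float drift instead of asserting cdf[-1] == 1
    if inverse:
        return interp1d(cdf, samples)
    return interp1d(samples, cdf, fill_value=(0, 1), bounds_error=False)


inv_cdf = sample_cdf_sketch(np.random.default_rng(0).normal(size=1000),
                            inverse=True)
median = float(inv_cdf(0.5))  # close to 0 for a standard normal
```

Each call draws a fresh CDF realisation, which is what lets `credibility_interval` below estimate error bars on the interval boundaries by repetition.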


def credibility_interval(samples, weights=None, level=0.68, method="iso-pdf",
return_covariance=False, nsamples=12):
"""Compute the credibility interval of weighted samples.

Based on linear interpolation of the cumulative distribution function, so
expect discretisation errors on the scale of the distances between samples.

https://github.com/Stefan-Heimersheim/fastCI#readme

Parameters
----------
samples : array
Samples to compute the credibility interval of.
weights : array, default=np.ones_like(samples)
Weights corresponding to samples.
level : float, default=0.68
Credibility level (probability, <1).
method : str, default='iso-pdf'
Which definition of interval to use:

* ``'iso-pdf'``: Calculate iso probability density interval with the
same probability density at each end. Also known as
waterline-interval or highest average posterior density interval.
This is only accurate if the distribution is sufficiently unimodal.
* ``'lower-limit'``/``'upper-limit'``: Lower/upper limit. One-sided
limits for which ``level`` fraction of the (equally weighted) samples
lie above/below the limit.
* ``'equal-tailed'``: Equal-tailed interval with the same fraction of
(equally weighted) samples below and above the interval region.

return_covariance : bool, default=False
Return the covariance of the sampled limits, in addition to the mean.
nsamples : int, default=12
Number of CDF samples to improve `mean` and `std` estimate.

Returns
-------
limit(s) : float, array, or tuple of floats or arrays
Returns the credibility interval boundary or boundaries. By default,
returns the mean over ``nsamples`` samples, which is either
two numbers (``method='iso-pdf'``/``'equal-tailed'``) or one number
(``method='lower-limit'``/``'upper-limit'``). If
``return_covariance=True``, returns a tuple (mean(s), covariance)
where covariance is the covariance over the sampled limits.
"""
if level >= 1:
raise ValueError('level must be <1, got {0:.2f}'.format(level))
if len(np.shape(samples)) != 1:
raise ValueError('Only 1D arrays of samples are supported')
if weights is not None and np.shape(samples) != np.shape(weights):
raise ValueError('Shape of samples and weights differs')

# Convert to numpy to unify indexing
samples = np.array(samples.copy())
if weights is None:
weights = np.ones(len(samples))
else:
weights = np.array(weights.copy())

# Convert samples to unit weights if that is not already the case
if not np.all(np.logical_or(weights == 0, weights == 1)):
# compress_weights with ncompress='equal' assures weights \in 0,1
# Note that this must be done, we cannot handle weights != 1
# see this discussion for details:
# https://github.com/williamjameshandley/anesthetic/pull/188#issuecomment-1274980982
weights = compress_weights(weights, ncompress='equal')

indices = np.where(weights)[0]
x = samples[indices]

# Sample the confidence interval multiple times
# to get errorbars on confidence interval boundaries
ci_samples = []
for i in range(nsamples):
invCDF = sample_cdf(x, inverse=True)
if method == 'iso-pdf':
# Find smallest interval
def distance(Y, level=level):
return invCDF(Y+level)-invCDF(Y)
res = minimize_scalar(distance, bounds=(0, 1-level),
method="Bounded")
ci_samples.append(np.array([invCDF(res.x),
invCDF(res.x+level)]))
elif method == 'lower-limit':
# Get value from which we reach the desired level
ci_samples.append(invCDF(1-level))
elif method == 'upper-limit':
# Get value to which we reach the desired level
ci_samples.append(invCDF(level))
elif method == 'equal-tailed':
ci_samples.append(np.array([invCDF((1-level)/2),
invCDF((1+level)/2)]))
else:
raise ValueError("Method '{0:}' unknown".format(method))
ci_samples = np.array(ci_samples)
if np.shape(ci_samples) == (nsamples, ):
if return_covariance:
return np.mean(ci_samples), np.cov(ci_samples)
else:
return np.mean(ci_samples)
else:
if return_covariance:
return np.mean(ci_samples, axis=0), \
np.cov(ci_samples, rowvar=False)
else:
return np.mean(ci_samples, axis=0)
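The iso-pdf search above can be exercised end-to-end with a minimal, self-contained sketch (plain empirical CDF, no Dirichlet resampling — an assumption made here for brevity): the shortest interval containing `level` of the mass is found by minimising `inv_cdf(y + level) - inv_cdf(y)` over `y`.

```python
import numpy as np
from scipy.interpolate import interp1d
from scipy.optimize import minimize_scalar

rng = np.random.default_rng(42)
samples = np.sort(rng.normal(size=5000))
cdf = np.linspace(0, 1, len(samples))  # plain empirical CDF
inv_cdf = interp1d(cdf, samples, kind='linear')

level = 0.68


def width(y):
    """Width of the interval covering `level` of the mass, starting at y."""
    return inv_cdf(y + level) - inv_cdf(y)


res = minimize_scalar(width, bounds=(0, 1 - level), method='bounded')
lo, hi = float(inv_cdf(res.x)), float(inv_cdf(res.x + level))
# For a standard normal the 68% iso-pdf interval is roughly (-1, 1).
```

For a symmetric unimodal distribution the iso-pdf and equal-tailed intervals coincide; they differ once the distribution is skewed.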


def mirror_1d(d, xmin=None, xmax=None):
"""If necessary apply reflecting boundary conditions."""
if xmin is not None and xmax is not None:
128 changes: 125 additions & 3 deletions anesthetic/weighted_pandas.py
@@ -3,16 +3,17 @@
import warnings
from inspect import signature
import numpy as np
from numpy.ma import masked_array
from pandas import Series, DataFrame, concat, MultiIndex
from pandas.core.groupby import GroupBy, SeriesGroupBy, DataFrameGroupBy, ops
from pandas._libs import lib
from pandas._libs.lib import no_default
from pandas.util._exceptions import find_stack_level
from pandas.util import hash_pandas_object
from numpy.ma import masked_array
from anesthetic.utils import (compress_weights, neff, quantile,
temporary_seed, adjust_docstrings)
from pandas.core.dtypes.missing import notna
from anesthetic.utils import (compress_weights, neff, quantile,
temporary_seed, adjust_docstrings,
credibility_interval)


class WeightedGroupBy(GroupBy):
@@ -345,6 +346,55 @@ def compress(self, ncompress=True):
def sample(self, *args, **kwargs): # noqa: D102
return super().sample(weights=self.get_weights(), *args, **kwargs)

def credibility_interval(self, level=0.68, method="iso-pdf",
return_covariance=False, nsamples=12):
"""Compute the credibility interval of the weighted samples.

Based on linear interpolation of the cumulative distribution function, so
expect discretisation errors on the scale of the distances between samples.

https://github.com/Stefan-Heimersheim/fastCI#readme

Parameters
----------
level : float, default=0.68
Credibility level (probability, <1).
method : str, default='iso-pdf'
Which definition of interval to use:

* ``'iso-pdf'``: Calculate iso probability density interval with
the same probability density at each end. Also known as
waterline-interval or highest average posterior density interval.
This is only accurate if the distribution is sufficiently
unimodal.
* ``'lower-limit'``/``'upper-limit'``: Lower/upper limit. One-sided
limits for which ``level`` fraction of the (equally weighted)
samples lie above/below the limit.
* ``'equal-tailed'``: Equal-tailed interval with the same fraction
of (equally weighted) samples below and above the interval
region.

return_covariance : bool, default=False
Return the covariance of the sampled limits, in addition to the mean.
nsamples : int, default=12
Number of CDF samples to improve `mean` and `std` estimate.

Returns
-------
limit(s) : float, array, or tuple of floats or arrays
Returns the credibility interval boundaries of the Series.
By default, returns the mean over ``nsamples`` samples, which is
either two numbers (``method='iso-pdf'``/``'equal-tailed'``) or
one number (``method='lower-limit'``/``'upper-limit'``). If
``return_covariance=True``, returns a tuple (mean(s), covariance)
where covariance is the covariance over the sampled limits.
"""
return credibility_interval(self, weights=self.get_weights(),
level=level, method=method,
return_covariance=return_covariance,
nsamples=nsamples)

@property
def _constructor(self):
return WeightedSeries
@@ -600,6 +650,78 @@ def sample(self, *args, **kwargs): # noqa: D102
else:
return super().sample(*args, **kwargs)

def credibility_interval(self, level=0.68, method="iso-pdf",
return_covariance=False, nsamples=12):
"""Compute the credibility interval of the weighted samples.

Based on linear interpolation of the cumulative distribution function, so
expect discretisation errors on the scale of the distances between samples.

https://github.com/Stefan-Heimersheim/fastCI#readme

Parameters
----------
level : float, default=0.68
Credibility level (probability, <1).
method : str, default='iso-pdf'
Which definition of interval to use:

* ``'iso-pdf'``: Calculate iso probability density interval with
the same probability density at each end. Also known as
waterline-interval or highest average posterior density interval.
This is only accurate if the distribution is sufficiently
unimodal.
* ``'lower-limit'``/``'upper-limit'``: Lower/upper limit. One-sided
limits for which ``level`` fraction of the (equally weighted)
samples lie above/below the limit.
* ``'equal-tailed'``: Equal-tailed interval with the same fraction
of (equally weighted) samples below and above the interval
region.

return_covariance : bool, default=False
Return the covariance of the sampled limits, in addition to the mean.
nsamples : int, default=12
Number of CDF samples to improve `mean` and `std` estimate.

Returns
-------
limit(s) : float, array, or tuple of floats or arrays
Returns the credibility interval boundaries for each column.
By default, returns the mean over ``nsamples`` samples, which is
either two numbers (``method='iso-pdf'``/``'equal-tailed'``) or
one number (``method='lower-limit'``/``'upper-limit'``). If
``return_covariance=True``, returns a tuple (means, covariances)
where covariances are the covariance over the sampled limits for
each column.
"""
if 'lower' in method:
limits = ['lower']
elif 'upper' in method:
limits = ['upper']
else:
limits = ['lower', 'upper']
cis = [credibility_interval(self[col], weights=self.get_weights(),
level=level, method=method,
return_covariance=return_covariance,
nsamples=nsamples) for col in self.columns]
if return_covariance:
cis, covs = zip(*cis)
mulidx = MultiIndex.from_product([
self.columns.get_level_values(level=0),
limits
])
ncol = len(self.columns)
nlim = len(limits)
covs = np.asarray(covs).reshape(nlim*ncol, nlim).T
covs = DataFrame(covs, index=limits, columns=mulidx)
cis = np.atleast_2d(cis) if 'limit' in method else np.asarray(cis).T
cis = DataFrame(data=cis, index=limits, columns=self.columns)
if return_covariance:
return cis, covs
else:
return cis
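The covariance reshaping in the method above can be illustrated with toy values (the fill values and column names here are hypothetical): per-column `nlim × nlim` covariance matrices are stacked into a single `(nlim, nlim * ncol)` frame whose columns carry a `(parameter, limit)` MultiIndex.

```python
import numpy as np
import pandas as pd

ncol, nlim = 2, 2
# One nlim x nlim covariance matrix per parameter column
covs = [np.full((nlim, nlim), fill) for fill in (1.0, 2.0)]
columns = ['x0', 'x1']
mulidx = pd.MultiIndex.from_product([columns, ['lower', 'upper']])
# Stack and transpose so each parameter's block sits under its own label
flat = np.asarray(covs).reshape(nlim * ncol, nlim).T
frame = pd.DataFrame(flat, index=['lower', 'upper'], columns=mulidx)
```

Selecting `frame['x0']` then recovers the first parameter's 2×2 covariance block, matching the `(2, 2 * len(params))` shape checked in the tests below.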

@property
def _constructor_sliced(self):
return WeightedSeries
33 changes: 33 additions & 0 deletions tests/test_samples.py
@@ -1780,3 +1780,36 @@ def test_axes_limits_2d(kind, kwargs):
assert 3 < xmax < 3.9
assert -3.9 < ymin < -3
assert 3 < ymax < 3.9


def test_credibility_interval():
np.random.seed(0)
pc = read_chains('./tests/example_data/pc')

ci, cov = pc.x0.credibility_interval(level=0.68,
method="iso-pdf",
return_covariance=True)
assert ci[0] == pytest.approx(-0.1, rel=0.01, abs=0.01)
assert ci[1] == pytest.approx(+0.1, rel=0.01, abs=0.01)
assert np.all(np.abs(cov) < 1e-3)
assert np.shape(ci) == (2,)
assert np.shape(cov) == (2, 2)

params = ['x0', 'x1']
ci, cov = pc[params].credibility_interval(level=0.68,
method="iso-pdf",
return_covariance=True)
assert_allclose(ci.loc['lower'], -0.1, rtol=0.01, atol=0.01)
assert_allclose(ci.loc['upper'], +0.1, rtol=0.01, atol=0.01)
assert np.all(np.abs(cov) < 1e-3)
assert ci.shape == (2, len(params))
assert cov.shape == (2, 2 * len(params))
ci, cov = pc[params].credibility_interval(level=0.95+0.025,
method='lower-limit',
return_covariance=True)
assert_allclose(ci, -0.2, rtol=0.01, atol=0.025)
assert np.all(np.abs(cov) < 1e-3)
assert cov.shape == (1, len(params))
ci = pc[params].credibility_interval(level=0.95+0.025,
method='upper-limit')
assert_allclose(ci, +0.2, rtol=0.01, atol=0.025)
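Beyond the symmetric test cases above, a hedged sketch on a skewed (exponential) distribution shows where the `'equal-tailed'` and `'iso-pdf'` definitions actually differ — the shortest 68% interval hugs zero, while the equal-tailed one excludes mass at both ends (plain empirical CDF assumed for simplicity):

```python
import numpy as np
from scipy.interpolate import interp1d
from scipy.optimize import minimize_scalar

rng = np.random.default_rng(1)
samples = np.sort(rng.exponential(size=20000))
cdf = np.linspace(0, 1, len(samples))
inv_cdf = interp1d(cdf, samples)

level = 0.68
# Equal-tailed: cut (1 - level)/2 of the mass from each tail
equal_tailed = (float(inv_cdf((1 - level) / 2)),
                float(inv_cdf((1 + level) / 2)))
# Iso-pdf: shortest interval containing `level` of the mass
res = minimize_scalar(lambda y: inv_cdf(y + level) - inv_cdf(y),
                      bounds=(0, 1 - level), method='bounded')
iso_pdf = (float(inv_cdf(res.x)), float(inv_cdf(res.x + level)))
# iso_pdf starts near 0 and is shorter than the equal-tailed interval
```

This mirrors the docstring's caveat: for monotonically decreasing densities the iso-pdf interval collapses onto a one-sided limit, which is why the `'lower-limit'`/`'upper-limit'` methods exist.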