diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 4a41e03..0000000 --- a/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2022 James Foster - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..0cc90ca --- /dev/null +++ b/README.md @@ -0,0 +1,71 @@ +# Markov Chain Cubature for Bayesian Inference + +This repository contains code for simulating Markov chains as a cloud of particles using cubature formulae. + +The `presentation` folder contains slides describing the methodology. + +The `scripts` folder contains the code. + +# Examples + +## 2D Gaussian Mixture + +This will run the cubature algorithm on a 2D Gaussian mixture: + + ```shell + python gmm_cubature.py + ``` + +Outputting something like + + ```shell + Time: 18.5 seconds +Number of particles: 1024 + +Cubature mean vector +[-4.13376 -1.8908577] + +Cubature covariance matrix +[[ 2.31224477 1.89332097] + [ 1.89332097 10.73603521]] + +Actual mean vector +[-4.148839 -1.8847313] + +Actual variances +[2.2467885 10.953863] + +Difference in means +0.016276047 +``` + +An illustration of the resulting particle configurations is given below: + +

+<!-- Particle-configuration snapshots saved by the script at iterations 0, 25, 100, 250, 500 and 1000: img/0.png, img/25.png, img/100.png, img/250.png, img/500.png, img/1000.png -->
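+The step size, the number of steps and the number of particles (a power of two) can be adjusted through the script's command-line flags (see the `argparse` options in `scripts/gmm_cubature.py`), for example:
+
+ ```shell
+ python gmm_cubature.py --stsize 0.1 --nsteps 1000 --npoints 1024
+ ```
+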
+ + ## Bayesian Logistic Regression + +This runs the algorithm for a Bayesian logistic regression on a binary Covertype dataset (see [SVGD paper](https://proceedings.neurips.cc/paper/2016/file/b3ba8f1bee1238a2f37603d90b58898d-Paper.pdf) for details): + + ```shell + python bayesian_logisitc_regression_cubature.py + ``` + +Outputting something like + ```shell +Time: 1137.0 seconds +Number of particles: 256 +[accuracy, log-likelihood] +[0.7573040282953107, -0.5628809048706431] + ``` +The content of markov_chain_cubature_presentation.pdf is licensed under the Creative Commons Attribution 4.0 International license, and the Python code is licensed under the MIT license. \ No newline at end of file diff --git a/data/README.md b/data/README.md new file mode 100644 index 0000000..12540a8 --- /dev/null +++ b/data/README.md @@ -0,0 +1,3 @@ +We didn't contribute the data sets here. Please let us know if any conflicts of interest. + +The binary Covertype dataset is from [LIBSVM Data Repository](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html) \ No newline at end of file diff --git a/data/covertype.mat b/data/covertype.mat new file mode 100644 index 0000000..af9bd30 Binary files /dev/null and b/data/covertype.mat differ diff --git a/img/0.png b/img/0.png new file mode 100644 index 0000000..fcf8c26 Binary files /dev/null and b/img/0.png differ diff --git a/img/100.png b/img/100.png new file mode 100644 index 0000000..dc3945c Binary files /dev/null and b/img/100.png differ diff --git a/img/1000.png b/img/1000.png new file mode 100644 index 0000000..89c2a5a Binary files /dev/null and b/img/1000.png differ diff --git a/img/25.png b/img/25.png new file mode 100644 index 0000000..5ccfd36 Binary files /dev/null and b/img/25.png differ diff --git a/img/250.png b/img/250.png new file mode 100644 index 0000000..8df4467 Binary files /dev/null and b/img/250.png differ diff --git a/img/500.png b/img/500.png new file mode 100644 index 0000000..683f099 Binary files /dev/null and b/img/500.png differ diff --git a/presentation/markov_chain_cubature_presentation.pdf b/presentation/markov_chain_cubature_presentation.pdf new file mode 100644 index 0000000..a632fd5 Binary files /dev/null and b/presentation/markov_chain_cubature_presentation.pdf differ diff --git a/scripts/bayesian_logisitc_regression_cubature.py b/scripts/bayesian_logisitc_regression_cubature.py new file mode 100644 index 0000000..4f477d2 --- /dev/null +++ b/scripts/bayesian_logisitc_regression_cubature.py @@ -0,0 +1,165 @@ +# This code is adapted from https://github.com/dilinwang820/Stein-Variational-Gradient-Descent/blob/master/python/bayesian_logistic_regression.py + +import numpy as np +import scipy.io +from sklearn.model_selection import train_test_split +import numpy.matlib as nm +import torch +from scipy.linalg import hadamard +from sklearn.neighbors import BallTree +import time + +''' + Example of Bayesian Logistic Regression (the same setting as Gershman et al. 2012): + The observed data D = {X, y} consist of N binary class labels, + y_t \in {-1,+1}, and d covariates for each datapoint, X_t \in R^d. + The hidden variables \theta = {w, \alpha} consist of d regression coefficients w_k \in R, + and a precision parameter \alpha \in R_+. We assume the following model: + p(\alpha) = Gamma(\alpha; a, b) + p(w_k | a) = N(w_k; 0, \alpha^-1) + p(y_t = 1| x_t, w) = 1 / (1+exp(-w^T x_t)) +''' +class BayesianLR: + def __init__(self, X, Y, batchsize=100, a0=1, b0=0.01): + self.X, self.Y = X, Y + # TODO. 
Y in \in{+1, -1} + self.batchsize = min(batchsize, X.shape[0]) + self.a0, self.b0 = a0, b0 + + self.N = X.shape[0] + self.permutation = np.random.permutation(self.N) + self.iter = 0 + + + def dlnprob(self, theta): + + if self.batchsize > 0: + batch = [ i % self.N for i in range(self.iter * self.batchsize, (self.iter + 1) * self.batchsize) ] + ridx = self.permutation[batch] + self.iter += 1 + else: + ridx = np.random.permutation(self.X.shape[0]) + + Xs = self.X[ridx, :] + Ys = self.Y[ridx] + + w = theta[:, :-1] # logistic weights + alpha = np.exp(theta[:, -1]) # the last column is logalpha + d = w.shape[1] + + wt = np.multiply((alpha / 2), np.sum(w ** 2, axis=1)) + + coff = np.matmul(Xs, w.T) + y_hat = 1.0 / (1.0 + np.exp(-1 * coff)) + + dw_data = np.matmul(((nm.repmat(np.vstack(Ys), 1, theta.shape[0]) + 1) / 2.0 - y_hat).T, Xs) # Y \in {-1,1} + dw_prior = -np.multiply(nm.repmat(np.vstack(alpha), 1, d) , w) + dw = dw_data * 1.0 * self.X.shape[0] / Xs.shape[0] + dw_prior # re-scale + + dalpha = d / 2.0 - wt + (self.a0 - 1) - self.b0 * alpha + 1 # the last term is the jacobian term + + return torch.Tensor(np.hstack([dw, np.vstack(dalpha)])) # % first order derivative + + def evaluation(self, theta, X_test, y_test): + theta = theta[:, :-1] + M, n_test = theta.shape[0], len(y_test) + + prob = np.zeros([n_test, M]) + for t in range(M): + coff = np.multiply(y_test, np.sum(-1 * np.multiply(nm.repmat(theta[t, :], n_test, 1), X_test), axis=1)) + prob[:, t] = np.divide(np.ones(n_test), (1 + np.exp(coff))) + + prob = np.mean(prob, axis=1) + acc = np.mean(prob > 0.5) + llh = np.mean(np.log(prob)) + return [acc, llh] + + +def propagate_measure(measure, gradlnprob, local_cubature, step_size, root_step_size): + n = measure.size(0) + new_measure = root_step_size*torch.Tensor(local_cubature).repeat(n, 1) + gradients = gradlnprob(np.array(measure)) + + # THεO POULA adjustment + abs_gradient = torch.abs(gradients) + + gradients = torch.div(gradients, 1.0 + root_step_size*abs_gradient) + gradients = gradients + torch.div(root_step_size*gradients, 1.0 + abs_gradient) + + next_points = (measure + gradients*step_size).repeat_interleave(len(local_cubature), dim=0) + new_measure += next_points + + return new_measure + +def produce_local_cubature(dimension): + max_power = np.ceil(np.log2(dimension)) + max_dim = int(2**max_power) + hadamard_matrix = hadamard(max_dim) + if (dimension == max_dim): + return np.sqrt(2.0)*np.vstack((hadamard_matrix, -hadamard_matrix)) + else: + new_matrix = hadamard_matrix[:, :dimension] + return np.sqrt(2.0)*np.vstack((new_matrix, -new_matrix)) + + +if __name__ == '__main__': + data = scipy.io.loadmat('../data/covertype.mat') + + X_input = data['covtype'][:, 1:] + y_input = data['covtype'][:, 0] + y_input[y_input == 2] = -1 + + N = X_input.shape[0] + X_input = np.hstack([X_input, np.ones([N, 1])]) + d = X_input.shape[1] + dimension = d + 1 + + # split the dataset into training and testing + X_train, X_test, y_train, y_test = train_test_split(X_input, y_input, test_size=0.2, random_state=42) + + a0, b0 = 1, 0.01 #hyperparameters + model = BayesianLR(X_train, y_train, 100, a0, b0) # batchsize = 100 + + # initialization + number_of_points = 1024 + cubature_measure = np.zeros([number_of_points, dimension]); + alpha0 = np.random.gamma(a0, b0, number_of_points); + + for i in range(number_of_points): + cubature_measure[i, :] = np.hstack([np.random.normal(0, np.sqrt(1 / alpha0[i]), d), np.log(alpha0[i])]) + + cubature_measure = torch.Tensor(cubature_measure) + local_cubature = 
produce_local_cubature(dimension) + + points_per_recombine = 128 + number_of_halving = 4 + number_of_steps = 1500 + step_size = 0.01 + root_step_size = np.sqrt(step_size) + + tic = time.time() + + for i in range(number_of_steps): + # Propagate cubature measure + big_measure = propagate_measure(cubature_measure, model.dlnprob, local_cubature, step_size, root_step_size) + + # Partition particles into subsets using a ball tree + ball_tree = BallTree(big_measure, leaf_size=0.5*points_per_recombine) + _, indices, nodes, _ = ball_tree.get_arrays() + + # Resample particles + new_indices = [np.random.choice(indices[nd[0]: nd[1]]) for nd in nodes if nd[2]] + cubature_measure = torch.Tensor(big_measure[new_indices]) + + if (i+1) % 100 == 0: + print('iter ' + str(i+1)) + + toc = time.time() + + print("Time: %.1f seconds" % (toc - tic)) + + print("Number of particles: ", len(cubature_measure)) + + print("[accuracy, log-likelihood]") + print(model.evaluation(cubature_measure, X_test, y_test)) + \ No newline at end of file diff --git a/scripts/bayesian_logistic_regression_svgd.py b/scripts/bayesian_logistic_regression_svgd.py new file mode 100644 index 0000000..042044f --- /dev/null +++ b/scripts/bayesian_logistic_regression_svgd.py @@ -0,0 +1,109 @@ +# This code is taken from https://github.com/dilinwang820/Stein-Variational-Gradient-Descent/blob/master/python/bayesian_logistic_regression.py + +import numpy as np +import scipy.io +from sklearn.model_selection import train_test_split +import numpy.matlib as nm +from svgd import SVGD + +import time + +''' + Example of Bayesian Logistic Regression (the same setting as Gershman et al. 2012): + The observed data D = {X, y} consist of N binary class labels, + y_t \in {-1,+1}, and d covariates for each datapoint, X_t \in R^d. + The hidden variables \theta = {w, \alpha} consist of d regression coefficients w_k \in R, + and a precision parameter \alpha \in R_+. We assume the following model: + p(\alpha) = Gamma(\alpha; a, b) + p(w_k | a) = N(w_k; 0, \alpha^-1) + p(y_t = 1| x_t, w) = 1 / (1+exp(-w^T x_t)) +''' +class BayesianLR: + def __init__(self, X, Y, batchsize=100, a0=1, b0=0.01): + self.X, self.Y = X, Y + # TODO. 
Y in \in{+1, -1} + self.batchsize = min(batchsize, X.shape[0]) + self.a0, self.b0 = a0, b0 + + self.N = X.shape[0] + self.permutation = np.random.permutation(self.N) + self.iter = 0 + + + def dlnprob(self, theta): + + if self.batchsize > 0: + batch = [ i % self.N for i in range(self.iter * self.batchsize, (self.iter + 1) * self.batchsize) ] + ridx = self.permutation[batch] + self.iter += 1 + else: + ridx = np.random.permutation(self.X.shape[0]) + + Xs = self.X[ridx, :] + Ys = self.Y[ridx] + + w = theta[:, :-1] # logistic weights + alpha = np.exp(theta[:, -1]) # the last column is logalpha + d = w.shape[1] + + wt = np.multiply((alpha / 2), np.sum(w ** 2, axis=1)) + + coff = np.matmul(Xs, w.T) + y_hat = 1.0 / (1.0 + np.exp(-1 * coff)) + + dw_data = np.matmul(((nm.repmat(np.vstack(Ys), 1, theta.shape[0]) + 1) / 2.0 - y_hat).T, Xs) # Y \in {-1,1} + dw_prior = -np.multiply(nm.repmat(np.vstack(alpha), 1, d) , w) + dw = dw_data * 1.0 * self.X.shape[0] / Xs.shape[0] + dw_prior # re-scale + + dalpha = d / 2.0 - wt + (self.a0 - 1) - self.b0 * alpha + 1 # the last term is the jacobian term + + return np.hstack([dw, np.vstack(dalpha)]) # % first order derivative + + def evaluation(self, theta, X_test, y_test): + theta = theta[:, :-1] + M, n_test = theta.shape[0], len(y_test) + + prob = np.zeros([n_test, M]) + for t in range(M): + coff = np.multiply(y_test, np.sum(-1 * np.multiply(nm.repmat(theta[t, :], n_test, 1), X_test), axis=1)) + prob[:, t] = np.divide(np.ones(n_test), (1 + np.exp(coff))) + + prob = np.mean(prob, axis=1) + acc = np.mean(prob > 0.5) + llh = np.mean(np.log(prob)) + return [acc, llh] + +if __name__ == '__main__': + data = scipy.io.loadmat('../data/covertype.mat') + + X_input = data['covtype'][:, 1:] + y_input = data['covtype'][:, 0] + y_input[y_input == 2] = -1 + + N = X_input.shape[0] + X_input = np.hstack([X_input, np.ones([N, 1])]) + d = X_input.shape[1] + D = d + 1 + + # split the dataset into training and testing + X_train, X_test, y_train, y_test = train_test_split(X_input, y_input, test_size=0.2, random_state=42) + + a0, b0 = 1, 0.01 #hyper-parameters + model = BayesianLR(X_train, y_train, 100, a0, b0) # batchsize = 100 + + # initialization + M = 256 # number of particles + theta0 = np.zeros([M, D]); + alpha0 = np.random.gamma(a0, b0, M); + for i in range(M): + theta0[i, :] = np.hstack([np.random.normal(0, np.sqrt(1 / alpha0[i]), d), np.log(alpha0[i])]) + + tic = time.time() + theta = SVGD().update(x0=theta0, lnprob=model.dlnprob, bandwidth=-1, n_iter=6000, stepsize=0.05, alpha=0.9, debug=True) + + toc = time.time() + + print("Time: %.1f seconds" % (toc - tic)) + + print('[accuracy, log-likelihood]') + print(model.evaluation(theta, X_test, y_test)) diff --git a/scripts/gmm.py b/scripts/gmm.py new file mode 100644 index 0000000..c409964 --- /dev/null +++ b/scripts/gmm.py @@ -0,0 +1,148 @@ +import torch +from torch import distributions as D +from torch.nn import functional as F + + +class MixtureSameFamily(D.Distribution): + """ Mixture (same-family) distribution. + + The `MixtureSameFamily` distribution implements a (batch of) mixture + distribution where all components are from different parameterizations of + the same distribution type. It is parameterized by a `Categorical` + "selecting distribution" (over `k` components) and a components + distribution, i.e., a `Distribution` with a rightmost batch shape + (equal to `[k]`) which indexes each (batch of) component. 
+ """ + + def __init__(self, + mixture_distribution, + components_distribution, + validate_args=None): + """ Construct a 'MixtureSameFamily' distribution + + Args:: + mixture_distribution: `torch.distributions.Categorical`-like + instance. Manages the probability of selecting components. + The number of categories must match the rightmost batch + dimension of the `components_distribution`. Must have either + scalar `batch_shape` or `batch_shape` matching + `components_distribution.batch_shape[:-1]` + components_distribution: `torch.distributions.Distribution`-like + instance. Right-most batch dimension indexes components. + + Examples:: + # Construct Gaussian Mixture Model in 1D consisting of 5 equally + # weighted normal distributions + >>> mix = D.Categorical(torch.ones(5,)) + >>> comp = D.Normal(torch.randn(5,), torch.rand(5,)) + >>> gmm = MixtureSameFamily(mix, comp) + + # Construct Gaussian Mixture Modle in 2D consisting of 5 equally + # weighted bivariate normal distributions + >>> mix = D.Categorical(torch.ones(5,)) + >>> comp = D.Independent(D.Normal( + torch.randn(5,2), torch.rand(5,2)), 1) + >>> gmm = MixtureSameFamily(mix, comp) + + # Construct a batch of 3 Gaussian Mixture Models in 2D each + # consisting of 5 random weighted bivariate normal distributions + >>> mix = D.Categorical(torch.rand(3,5)) + >>> comp = D.Independent(D.Normal( + torch.randn(3,5,2), torch.rand(3,5,2)), 1) + >>> gmm = MixtureSameFamily(mix, comp) + + """ + self._mixture_distribution = mixture_distribution + self._components_distribution = components_distribution + + if not isinstance(self._mixture_distribution, D.Categorical): + raise ValueError(" The Mixture distribution needs to be an " + " instance of torch.distribtutions.Categorical") + + if not isinstance(self._components_distribution, D.Distribution): + raise ValueError("The Component distribution need to be an " + "instance of torch.distributions.Distribution") + + # Check that batch size matches + mdbs = self._mixture_distribution.batch_shape + cdbs = self._components_distribution.batch_shape[:-1] + if len(mdbs) != 0 and mdbs != cdbs: + raise ValueError("`mixture_distribution.batch_shape` ({0}) is not " + "compatible with `components_distribution." 
+ "batch_shape`({1})".format(mdbs, cdbs)) + + # Check that the number of mixture components matches + km = self._mixture_distribution.logits.shape[-1] + kc = self._components_distribution.batch_shape[-1] + if km is not None and kc is not None and km != kc: + raise ValueError("`mixture_distribution components` ({0}) does not" + " equal `components_distribution.batch_shape[-1]`" + " ({1})".format(km, kc)) + self._num_components = km + + event_shape = self._components_distribution.event_shape + self._event_ndims = len(event_shape) + super(MixtureSameFamily, self).__init__(batch_shape=cdbs, + event_shape=event_shape, + validate_args=validate_args) + + @property + def mixture_distribution(self): + return self._mixture_distribution + + @property + def components_distribution(self): + return self._components_distribution + + @property + def mean(self): + probs = self._pad_mixture_dimensions(self.mixture_distribution.probs) + return torch.sum(probs * self.components_distribution.mean, + dim=-1-self._event_ndims) # [B, E] + + @property + def variance(self): + # Law of total variance: Var(Y) = E[Var(Y|X)] + Var(E[Y|X]) + probs = self._pad_mixture_dimensions(self.mixture_distribution.probs) + mean_cond_var = torch.sum(probs*self.components_distribution.variance, + dim=-1-self._event_ndims) + var_cond_mean = torch.sum(probs * (self.components_distribution.mean - + self._pad(self.mean)).pow(2.0), + dim=-1-self._event_ndims) + return mean_cond_var + var_cond_mean + + def log_prob(self, x): + x = self._pad(x) + log_prob_x = self.components_distribution.log_prob(x) # [S, B, k] + log_mix_prob = torch.log_softmax(self.mixture_distribution.logits, + dim=-1) # [B, k] + return torch.logsumexp(log_prob_x + log_mix_prob, dim=-1) # [S, B] + + def sample(self, sample_shape=torch.Size()): + with torch.no_grad(): + # [n, B] + mix_sample = self.mixture_distribution.sample(sample_shape) + # [n, B, k, E] + comp_sample = self.components_distribution.sample(sample_shape) + # [n, B, k] + mask = F.one_hot(mix_sample, self._num_components) + # [n, B, k, [1]*E] + mask = self._pad_mixture_dimensions(mask) + return torch.sum(comp_sample * mask.float(), + dim=-1-self._event_ndims) + + def _pad(self, x): + d = len(x.shape) - self._event_ndims + s = x.shape + x = x.reshape(*s[:d], 1, *s[d:]) + return x + + def _pad_mixture_dimensions(self, x): + dist_batch_ndims = self.batch_shape.numel() + cat_batch_ndims = self.mixture_distribution.batch_shape.numel() + pad_ndims = 0 if cat_batch_ndims == 1 else \ + dist_batch_ndims - cat_batch_ndims + s = x.shape + x = torch.reshape(x, shape=(*s[:-1], *(pad_ndims*[1]), + *s[-1:], *(self._event_ndims*[1]))) + return x \ No newline at end of file diff --git a/scripts/gmm_cubature.py b/scripts/gmm_cubature.py new file mode 100644 index 0000000..70271a1 --- /dev/null +++ b/scripts/gmm_cubature.py @@ -0,0 +1,160 @@ +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +plt.rc('text', usetex=True) +plt.rc('text.latex', preamble=r'\usepackage{amsmath}') +from sklearn.neighbors import BallTree +from gmm import MixtureSameFamily +import torch +import torch.autograd as autograd +from scipy.linalg import hadamard +import argparse +import time + +def produce_local_cubature(dimension): + max_power = np.ceil(np.log2(dimension)) + max_dim = int(2**max_power) + hadamard_matrix = hadamard(max_dim) + if (dimension == max_dim): + return np.vstack((hadamard_matrix, -hadamard_matrix)) + else: + new_matrix = hadamard_matrix[:, :dimension] + return np.vstack((new_matrix, -new_matrix)) + +def 
grad_log_prob(P, X): + X = X.detach().requires_grad_(True) + + log_prob = P.log_prob(X) + + return autograd.grad(log_prob.sum(), X)[0] + +def propagate_measure(measure, target_dist, local_cubature, step_size, root_two_step_size): + n = measure.size(0) + new_measure = root_two_step_size*torch.Tensor(local_cubature).repeat(n, 1) + X = measure.detach().requires_grad_() + target_dist.log_prob(X).backward(torch.ones(n)) + grads = X.grad + next_points = (measure + grads*step_size).repeat_interleave(len(local_cubature), dim=0) + new_measure += next_points + return new_measure + +def plot_output(P, X, d=7.0, step=0.1, save=False, i=None): + xv, yv = torch.meshgrid([ + torch.arange(-(d+2.0), (d-2.0), step), + torch.arange(-(d+2.0), d, step) + ], indexing="ij") + pos_xy = torch.cat((xv.unsqueeze(-1), yv.unsqueeze(-1)), dim=-1) + p_xy = P.log_prob(pos_xy.to(device)).exp().unsqueeze(-1).cpu() + + df = torch.cat([pos_xy, p_xy], dim=-1).numpy() + df = pd.DataFrame({ + 'x': df[:, :, 0].ravel(), + 'y': df[:, :, 1].ravel(), + 'p': df[:, :, 2].ravel(), + }) + + mu_hat = cubature_measure.mean(0) + S = np.cov(cubature_measure.T, bias=True) + mu = P.mean.numpy() + Sigma = P.variance.numpy() + + mnorm = np.linalg.norm(mu_hat - mu) + Snorm = np.linalg.norm(np.diag(S) - Sigma) + + fig1, ax2 = plt.subplots(constrained_layout=True) + ax2.scatter(X[:, 0], X[:, 1], c='r', s=5.) + ax2.set_aspect("equal") + plt.title(r'Iter %d; $\lVert \mu - \widehat{\mu} \rVert = %.3f; \lVert \Sigma - \widehat{\Sigma} \rVert = %.3f$' % (i, mnorm, Snorm)) + if save and i is not None: + plt.savefig("../img/%d.png" % i) + plt.close() + else: + plt.show() + + +if __name__ == "__main__": + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + parser = argparse.ArgumentParser( + description="Run cubature-based Langevin algorithm on a 2D Gaussian mixture model") + parser.add_argument("--stsize", type=float, default=.1, help="step size") + parser.add_argument("--nsteps", type=int, default=1000, + help="number of steps") + parser.add_argument("--npoints", type=int, default=1000, + help="number of points (power of two)") + args = parser.parse_args() + + step_size = args.stsize + dimension = 2 + number_of_steps = args.nsteps + number_of_points = args.npoints + no_of_recombines = number_of_points + number_of_halving = int(np.log2(dimension)) + 1 + log_no_of_recombines = int(np.log2(no_of_recombines)) + no_of_recombines = 2**log_no_of_recombines + + if dimension != int(2**(number_of_halving - 1)): + number_of_halving = int(np.log2(dimension)) + 2 + + root_two_step_size = np.sqrt(2.0*step_size) + points_per_recombine = (number_of_points * pow(2, number_of_halving))/no_of_recombines + + """ + Set up initial measure and target + """ + np.random.seed(1337) + torch.manual_seed(1337) + + mix = torch.distributions.Categorical(torch.Tensor([.2, .5, .3])) + comp = torch.distributions.Independent(torch.distributions.Normal(3.*torch.randn(3,2), torch.Tensor([[1.5, .6], [1., 1.], [1.1, 1.6]])), 1) + measure = MixtureSameFamily(mix, comp, validate_args=False) + + prior_mean = torch.Tensor([4.0] * dimension).to(device) + prior_cov = torch.Tensor([1.0] * dimension).diag().to(device) + prior = torch.distributions.MultivariateNormal( + prior_mean, covariance_matrix=prior_cov) + + prior_samples = prior.sample(torch.Size([number_of_points])).to(device) + + cubature_measure = prior_samples.clone() + + local_cubature = produce_local_cubature(dimension) + + tic = time.time() + for i in range(number_of_steps): + #if i % 25 == 0: + # 
plot_output(measure, cubature_measure, d=10., step=.1, save=True, i=i) + + # Propagate cubature measure + big_measure = propagate_measure(cubature_measure, measure, local_cubature, step_size, root_two_step_size) + + # Partition particles into subsets using a ball tree + ball_tree = BallTree(big_measure, leaf_size=0.5*points_per_recombine) + _, indices, nodes, _ = ball_tree.get_arrays() + + # Resample particles + new_indices = [np.random.choice(indices[nd[0]: nd[1]]) for nd in nodes if nd[2]] + + cubature_measure = torch.Tensor(big_measure[new_indices]) + + toc = time.time() + print("Time: %.1f seconds" % (toc - tic)) + #plot_output(measure, cubature_measure, 10.0, .1, save=True, i=1000) + + cubature_measure = np.array(cubature_measure) + + print("Number of particles: ", len(cubature_measure)) + print("") + print("Cubature mean vector") + print(cubature_measure.mean(0)) + print("") + print("Cubature covariance matrix") + print(np.cov(cubature_measure.T, bias=True)) + print("") + print("Actual mean vector") + print(np.array(measure.mean)) + print("") + print("Actual variances") + print(np.array(measure.variance)) + print("") + print("Difference in means") + print(np.linalg.norm(cubature_measure.mean(0) - np.array(measure.mean))) \ No newline at end of file diff --git a/scripts/gmm_mcmc.py b/scripts/gmm_mcmc.py new file mode 100644 index 0000000..d33c00b --- /dev/null +++ b/scripts/gmm_mcmc.py @@ -0,0 +1,120 @@ +import matplotlib.pyplot as plt +plt.rc('text', usetex=True) +plt.rc('text.latex', preamble=r'\usepackage{amsmath}') + +from gmm import MixtureSameFamily +import torch +import numpy as np +import time +import pandas as pd +import argparse + +def plot_output(P, X, d=7.0, step=0.1, save=False, i=None): + xv, yv = torch.meshgrid([ + torch.arange(-d, d, step), + torch.arange(-d, d, step) + ], indexing="ij") + pos_xy = torch.cat((xv.unsqueeze(-1), yv.unsqueeze(-1)), dim=-1) + p_xy = P.log_prob(pos_xy.to(device)).exp().unsqueeze(-1).cpu() + + df = torch.cat([pos_xy, p_xy], dim=-1).numpy() + df = pd.DataFrame({ + 'x': df[:, :, 0].ravel(), + 'y': df[:, :, 1].ravel(), + 'p': df[:, :, 2].ravel(), + }) + + mu_hat = points.mean(0) + S = np.cov(points.T, bias=True) + mu = P.mean.numpy() + Sigma = P.variance.numpy() + + mnorm = np.linalg.norm(mu_hat - mu) + Snorm = np.linalg.norm(np.diag(S) - Sigma) + + fig1, ax2 = plt.subplots(constrained_layout=True) + + ax2.scatter(X[:, 0], X[:, 1], c='r', s=5.) 
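+    # Note: the density grid assembled in `df` above is computed but not drawn; only the particle scatter is plotted.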
+ ax2.set_aspect("equal") + plt.title(r'Iter %d; $\lVert \mu - \hat{\mu} \rVert = %.3f; \lVert \Sigma - S \rVert = %.3f$' % (i, mnorm, Snorm)) + if save and i is not None: + plt.savefig("../img/%d.png" % i) + plt.close() + else: + plt.show() + +if __name__ == "__main__": + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + + parser = argparse.ArgumentParser( + description="Run unadjusted Langevin algorithm on a 2D Gaussian mixture") + parser.add_argument("--stsize", type=float, default=.1, help="step size") + parser.add_argument("--nsteps", type=int, default=1001000, + help="number of steps") + parser.add_argument("--burnin", type=int, default=1000, + help="burn-in period") + args = parser.parse_args() + + step_size = args.stsize + number_of_steps = args.nsteps + burn_in = args.burnin + dimension = 2 + + root_two_step_size = np.sqrt(2.0*step_size) + + """ + Set up initial measure and target + """ + np.random.seed(1337) + torch.manual_seed(1337) + + mix = torch.distributions.Categorical(torch.Tensor([.2, .5, .3])) + comp = torch.distributions.Independent(torch.distributions.Normal(3.*torch.randn(3,2), torch.Tensor([[1.5, .6], [1., 1.], [1.1, 1.6]])), 1) + measure = MixtureSameFamily(mix, comp, validate_args=False) + + prior_mean = torch.Tensor([4.0] * dimension).to(device) + prior_cov = torch.Tensor([1.0] * dimension).diag().to(device) + prior = torch.distributions.MultivariateNormal(prior_mean, covariance_matrix=prior_cov) + + tic = time.time() + + current_point = prior.sample().to(device) + mu0 = np.zeros(dimension) + Sigma0 = np.eye(dimension) + standard_normal = torch.distributions.MultivariateNormal(torch.Tensor(mu0).to(device), + covariance_matrix=torch.Tensor(Sigma0).to(device)) + + points = torch.zeros(number_of_steps - burn_in, dimension) + points[0] = current_point + for i in range(number_of_steps): + X = current_point.detach().requires_grad_() + measure.log_prob(X).backward() + w = standard_normal.sample().to(device) + current_point = current_point + (X.grad)*step_size + root_two_step_size*w + if (i >= burn_in): + points[i-burn_in] = current_point + + toc = time.time() + + points = np.array(points) + + print("Time: %.1f seconds" % (toc - tic)) + + print("Number of samples: ", len(points)) + print("") + print("Empircal mean vector") + print(points.mean(0)) + print("") + print("Empircal covariance matrix") + print(np.cov(points.T, bias=True)) + print("") + print("Actual mean vector") + print(np.array(measure.mean)) + print("") + print("Actual variances") + print(np.array(measure.variance)) + print("") + print("Difference in means") + print(np.linalg.norm(points.mean(0) - np.array(measure.mean))) + + #plot_output(measure, points, 15., .1, save=True, i=1000) \ No newline at end of file diff --git a/scripts/svgd.py b/scripts/svgd.py new file mode 100644 index 0000000..f1628b7 --- /dev/null +++ b/scripts/svgd.py @@ -0,0 +1,57 @@ +# This code is taken from https://github.com/dilinwang820/Stein-Variational-Gradient-Descent/blob/master/python/svgd.py + +import numpy as np +from scipy.spatial.distance import pdist, squareform + +class SVGD(): + + def __init__(self): + pass + + def svgd_kernel(self, theta, h = -1): + sq_dist = pdist(theta) + pairwise_dists = squareform(sq_dist)**2 + if h < 0: # if h < 0, using median trick + h = np.median(pairwise_dists) + h = np.sqrt(0.5 * h / np.log(theta.shape[0]+1)) + + # compute the rbf kernel + Kxy = np.exp( -pairwise_dists / h**2 / 2) + + dxkxy = -np.matmul(Kxy, theta) + sumkxy = np.sum(Kxy, axis=1) + for i in 
range(theta.shape[1]): + dxkxy[:, i] = dxkxy[:,i] + np.multiply(theta[:,i],sumkxy) + dxkxy = dxkxy / (h**2) + return (Kxy, dxkxy) + + + def update(self, x0, lnprob, n_iter = 1000, stepsize = 1e-3, bandwidth = -1, alpha = 0.9, debug = False): + # Check input + if x0 is None or lnprob is None: + raise ValueError('x0 or lnprob cannot be None!') + + theta = np.copy(x0) + + # adagrad with momentum + fudge_factor = 1e-6 + historical_grad = 0 + for iter in range(n_iter): + if debug and (iter+1) % 1000 == 0: + print('iter ' + str(iter+1)) + + lnpgrad = lnprob(theta) + # calculating the kernel matrix + kxy, dxkxy = self.svgd_kernel(theta, h = -1) + grad_theta = (np.matmul(kxy, lnpgrad) + dxkxy) / x0.shape[0] + + # adagrad + if iter == 0: + historical_grad = historical_grad + grad_theta ** 2 + else: + historical_grad = alpha * historical_grad + (1 - alpha) * (grad_theta ** 2) + adj_grad = np.divide(grad_theta, fudge_factor+np.sqrt(historical_grad)) + theta = theta + stepsize * adj_grad + + return theta +
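+
+# Usage sketch (hypothetical example, not part of the original SVGD code): sampling a
+# standard 2-D Gaussian, whose log-density gradient is simply -theta.
+#
+#   import numpy as np
+#   from svgd import SVGD
+#
+#   x0 = np.random.normal(0, 1, [100, 2])            # 100 particles in 2 dimensions
+#   theta = SVGD().update(x0, lambda t: -t, n_iter=1000, stepsize=0.01)
+#   print(theta.mean(axis=0), theta.var(axis=0))      # should be close to 0 and 1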