diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 4a41e03..0000000 --- a/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2022 James Foster - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..0cc90ca --- /dev/null +++ b/README.md @@ -0,0 +1,71 @@ +# Markov Chain Cubature for Bayesian Inference + +This repository contains code for simulating Markov chains as a cloud of particles using cubature formulae. + +The `presentation` folder contains slides describing the methodology. + +The `scripts` folder contains the code. + +# Examples + +## 2D Gaussian Mixture + +This will run the cubature algorithm on a 2D Gaussian mixture: + + ```shell + python gmm_cubature.py + ``` + +Outputting something like + + ```shell + Time: 18.5 seconds +Number of particles: 1024 + +Cubature mean vector +[-4.13376 -1.8908577] + +Cubature covariance matrix +[[ 2.31224477 1.89332097] + [ 1.89332097 10.73603521]] + +Actual mean vector +[-4.148839 -1.8847313] + +Actual variances +[2.2467885 10.953863] + +Difference in means +0.016276047 +``` + +An illustration of the resulting particle configurations is given below: + +

+<!-- Particle-configuration snapshots saved by the script at iterations 0, 25, 100, 250, 500 and 1000: img/0.png, img/25.png, img/100.png, img/250.png, img/500.png, img/1000.png -->
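+The step size, the number of steps and the number of particles (a power of two) can be adjusted through the script's command-line flags (see the `argparse` options in `scripts/gmm_cubature.py`), for example:
+
+ ```shell
+ python gmm_cubature.py --stsize 0.1 --nsteps 1000 --npoints 1024
+ ```
+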
+ + ## Bayesian Logistic Regression + +This runs the algorithm for a Bayesian logistic regression on a binary Covertype dataset (see [SVGD paper](https://proceedings.neurips.cc/paper/2016/file/b3ba8f1bee1238a2f37603d90b58898d-Paper.pdf) for details): + + ```shell + python bayesian_logisitc_regression_cubature.py + ``` + +Outputting something like + ```shell +Time: 1137.0 seconds +Number of particles: 256 +[accuracy, log-likelihood] +[0.7573040282953107, -0.5628809048706431] + ``` +The content of markov_chain_cubature_presentation.pdf is licensed under the Creative Commons Attribution 4.0 International license, and the Python code is licensed under the MIT license. \ No newline at end of file diff --git a/data/README.md b/data/README.md new file mode 100644 index 0000000..12540a8 --- /dev/null +++ b/data/README.md @@ -0,0 +1,3 @@ +We didn't contribute the data sets here. Please let us know if any conflicts of interest. + +The binary Covertype dataset is from [LIBSVM Data Repository](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html) \ No newline at end of file diff --git a/data/covertype.mat b/data/covertype.mat new file mode 100644 index 0000000..af9bd30 Binary files /dev/null and b/data/covertype.mat differ diff --git a/img/0.png b/img/0.png new file mode 100644 index 0000000..fcf8c26 Binary files /dev/null and b/img/0.png differ diff --git a/img/100.png b/img/100.png new file mode 100644 index 0000000..dc3945c Binary files /dev/null and b/img/100.png differ diff --git a/img/1000.png b/img/1000.png new file mode 100644 index 0000000..89c2a5a Binary files /dev/null and b/img/1000.png differ diff --git a/img/25.png b/img/25.png new file mode 100644 index 0000000..5ccfd36 Binary files /dev/null and b/img/25.png differ diff --git a/img/250.png b/img/250.png new file mode 100644 index 0000000..8df4467 Binary files /dev/null and b/img/250.png differ diff --git a/img/500.png b/img/500.png new file mode 100644 index 0000000..683f099 Binary files /dev/null and b/img/500.png differ diff --git a/presentation/markov_chain_cubature_presentation.pdf b/presentation/markov_chain_cubature_presentation.pdf new file mode 100644 index 0000000..a632fd5 Binary files /dev/null and b/presentation/markov_chain_cubature_presentation.pdf differ diff --git a/scripts/bayesian_logisitc_regression_cubature.py b/scripts/bayesian_logisitc_regression_cubature.py new file mode 100644 index 0000000..4f477d2 --- /dev/null +++ b/scripts/bayesian_logisitc_regression_cubature.py @@ -0,0 +1,165 @@ +# This code is adapted from https://github.com/dilinwang820/Stein-Variational-Gradient-Descent/blob/master/python/bayesian_logistic_regression.py + +import numpy as np +import scipy.io +from sklearn.model_selection import train_test_split +import numpy.matlib as nm +import torch +from scipy.linalg import hadamard +from sklearn.neighbors import BallTree +import time + +''' + Example of Bayesian Logistic Regression (the same setting as Gershman et al. 2012): + The observed data D = {X, y} consist of N binary class labels, + y_t \in {-1,+1}, and d covariates for each datapoint, X_t \in R^d. + The hidden variables \theta = {w, \alpha} consist of d regression coefficients w_k \in R, + and a precision parameter \alpha \in R_+. We assume the following model: + p(\alpha) = Gamma(\alpha; a, b) + p(w_k | a) = N(w_k; 0, \alpha^-1) + p(y_t = 1| x_t, w) = 1 / (1+exp(-w^T x_t)) +''' +class BayesianLR: + def __init__(self, X, Y, batchsize=100, a0=1, b0=0.01): + self.X, self.Y = X, Y + # TODO. 
Y in \in{+1, -1} + self.batchsize = min(batchsize, X.shape[0]) + self.a0, self.b0 = a0, b0 + + self.N = X.shape[0] + self.permutation = np.random.permutation(self.N) + self.iter = 0 + + + def dlnprob(self, theta): + + if self.batchsize > 0: + batch = [ i % self.N for i in range(self.iter * self.batchsize, (self.iter + 1) * self.batchsize) ] + ridx = self.permutation[batch] + self.iter += 1 + else: + ridx = np.random.permutation(self.X.shape[0]) + + Xs = self.X[ridx, :] + Ys = self.Y[ridx] + + w = theta[:, :-1] # logistic weights + alpha = np.exp(theta[:, -1]) # the last column is logalpha + d = w.shape[1] + + wt = np.multiply((alpha / 2), np.sum(w ** 2, axis=1)) + + coff = np.matmul(Xs, w.T) + y_hat = 1.0 / (1.0 + np.exp(-1 * coff)) + + dw_data = np.matmul(((nm.repmat(np.vstack(Ys), 1, theta.shape[0]) + 1) / 2.0 - y_hat).T, Xs) # Y \in {-1,1} + dw_prior = -np.multiply(nm.repmat(np.vstack(alpha), 1, d) , w) + dw = dw_data * 1.0 * self.X.shape[0] / Xs.shape[0] + dw_prior # re-scale + + dalpha = d / 2.0 - wt + (self.a0 - 1) - self.b0 * alpha + 1 # the last term is the jacobian term + + return torch.Tensor(np.hstack([dw, np.vstack(dalpha)])) # % first order derivative + + def evaluation(self, theta, X_test, y_test): + theta = theta[:, :-1] + M, n_test = theta.shape[0], len(y_test) + + prob = np.zeros([n_test, M]) + for t in range(M): + coff = np.multiply(y_test, np.sum(-1 * np.multiply(nm.repmat(theta[t, :], n_test, 1), X_test), axis=1)) + prob[:, t] = np.divide(np.ones(n_test), (1 + np.exp(coff))) + + prob = np.mean(prob, axis=1) + acc = np.mean(prob > 0.5) + llh = np.mean(np.log(prob)) + return [acc, llh] + + +def propagate_measure(measure, gradlnprob, local_cubature, step_size, root_step_size): + n = measure.size(0) + new_measure = root_step_size*torch.Tensor(local_cubature).repeat(n, 1) + gradients = gradlnprob(np.array(measure)) + + # THεO POULA adjustment + abs_gradient = torch.abs(gradients) + + gradients = torch.div(gradients, 1.0 + root_step_size*abs_gradient) + gradients = gradients + torch.div(root_step_size*gradients, 1.0 + abs_gradient) + + next_points = (measure + gradients*step_size).repeat_interleave(len(local_cubature), dim=0) + new_measure += next_points + + return new_measure + +def produce_local_cubature(dimension): + max_power = np.ceil(np.log2(dimension)) + max_dim = int(2**max_power) + hadamard_matrix = hadamard(max_dim) + if (dimension == max_dim): + return np.sqrt(2.0)*np.vstack((hadamard_matrix, -hadamard_matrix)) + else: + new_matrix = hadamard_matrix[:, :dimension] + return np.sqrt(2.0)*np.vstack((new_matrix, -new_matrix)) + + +if __name__ == '__main__': + data = scipy.io.loadmat('../data/covertype.mat') + + X_input = data['covtype'][:, 1:] + y_input = data['covtype'][:, 0] + y_input[y_input == 2] = -1 + + N = X_input.shape[0] + X_input = np.hstack([X_input, np.ones([N, 1])]) + d = X_input.shape[1] + dimension = d + 1 + + # split the dataset into training and testing + X_train, X_test, y_train, y_test = train_test_split(X_input, y_input, test_size=0.2, random_state=42) + + a0, b0 = 1, 0.01 #hyperparameters + model = BayesianLR(X_train, y_train, 100, a0, b0) # batchsize = 100 + + # initialization + number_of_points = 1024 + cubature_measure = np.zeros([number_of_points, dimension]); + alpha0 = np.random.gamma(a0, b0, number_of_points); + + for i in range(number_of_points): + cubature_measure[i, :] = np.hstack([np.random.normal(0, np.sqrt(1 / alpha0[i]), d), np.log(alpha0[i])]) + + cubature_measure = torch.Tensor(cubature_measure) + local_cubature = 
produce_local_cubature(dimension) + + points_per_recombine = 128 + number_of_halving = 4 + number_of_steps = 1500 + step_size = 0.01 + root_step_size = np.sqrt(step_size) + + tic = time.time() + + for i in range(number_of_steps): + # Propagate cubature measure + big_measure = propagate_measure(cubature_measure, model.dlnprob, local_cubature, step_size, root_step_size) + + # Partition particles into subsets using a ball tree + ball_tree = BallTree(big_measure, leaf_size=0.5*points_per_recombine) + _, indices, nodes, _ = ball_tree.get_arrays() + + # Resample particles + new_indices = [np.random.choice(indices[nd[0]: nd[1]]) for nd in nodes if nd[2]] + cubature_measure = torch.Tensor(big_measure[new_indices]) + + if (i+1) % 100 == 0: + print('iter ' + str(i+1)) + + toc = time.time() + + print("Time: %.1f seconds" % (toc - tic)) + + print("Number of particles: ", len(cubature_measure)) + + print("[accuracy, log-likelihood]") + print(model.evaluation(cubature_measure, X_test, y_test)) + \ No newline at end of file diff --git a/scripts/bayesian_logistic_regression_svgd.py b/scripts/bayesian_logistic_regression_svgd.py new file mode 100644 index 0000000..042044f --- /dev/null +++ b/scripts/bayesian_logistic_regression_svgd.py @@ -0,0 +1,109 @@ +# This code is taken from https://github.com/dilinwang820/Stein-Variational-Gradient-Descent/blob/master/python/bayesian_logistic_regression.py + +import numpy as np +import scipy.io +from sklearn.model_selection import train_test_split +import numpy.matlib as nm +from svgd import SVGD + +import time + +''' + Example of Bayesian Logistic Regression (the same setting as Gershman et al. 2012): + The observed data D = {X, y} consist of N binary class labels, + y_t \in {-1,+1}, and d covariates for each datapoint, X_t \in R^d. + The hidden variables \theta = {w, \alpha} consist of d regression coefficients w_k \in R, + and a precision parameter \alpha \in R_+. We assume the following model: + p(\alpha) = Gamma(\alpha; a, b) + p(w_k | a) = N(w_k; 0, \alpha^-1) + p(y_t = 1| x_t, w) = 1 / (1+exp(-w^T x_t)) +''' +class BayesianLR: + def __init__(self, X, Y, batchsize=100, a0=1, b0=0.01): + self.X, self.Y = X, Y + # TODO. 
Y in \in{+1, -1} + self.batchsize = min(batchsize, X.shape[0]) + self.a0, self.b0 = a0, b0 + + self.N = X.shape[0] + self.permutation = np.random.permutation(self.N) + self.iter = 0 + + + def dlnprob(self, theta): + + if self.batchsize > 0: + batch = [ i % self.N for i in range(self.iter * self.batchsize, (self.iter + 1) * self.batchsize) ] + ridx = self.permutation[batch] + self.iter += 1 + else: + ridx = np.random.permutation(self.X.shape[0]) + + Xs = self.X[ridx, :] + Ys = self.Y[ridx] + + w = theta[:, :-1] # logistic weights + alpha = np.exp(theta[:, -1]) # the last column is logalpha + d = w.shape[1] + + wt = np.multiply((alpha / 2), np.sum(w ** 2, axis=1)) + + coff = np.matmul(Xs, w.T) + y_hat = 1.0 / (1.0 + np.exp(-1 * coff)) + + dw_data = np.matmul(((nm.repmat(np.vstack(Ys), 1, theta.shape[0]) + 1) / 2.0 - y_hat).T, Xs) # Y \in {-1,1} + dw_prior = -np.multiply(nm.repmat(np.vstack(alpha), 1, d) , w) + dw = dw_data * 1.0 * self.X.shape[0] / Xs.shape[0] + dw_prior # re-scale + + dalpha = d / 2.0 - wt + (self.a0 - 1) - self.b0 * alpha + 1 # the last term is the jacobian term + + return np.hstack([dw, np.vstack(dalpha)]) # % first order derivative + + def evaluation(self, theta, X_test, y_test): + theta = theta[:, :-1] + M, n_test = theta.shape[0], len(y_test) + + prob = np.zeros([n_test, M]) + for t in range(M): + coff = np.multiply(y_test, np.sum(-1 * np.multiply(nm.repmat(theta[t, :], n_test, 1), X_test), axis=1)) + prob[:, t] = np.divide(np.ones(n_test), (1 + np.exp(coff))) + + prob = np.mean(prob, axis=1) + acc = np.mean(prob > 0.5) + llh = np.mean(np.log(prob)) + return [acc, llh] + +if __name__ == '__main__': + data = scipy.io.loadmat('../data/covertype.mat') + + X_input = data['covtype'][:, 1:] + y_input = data['covtype'][:, 0] + y_input[y_input == 2] = -1 + + N = X_input.shape[0] + X_input = np.hstack([X_input, np.ones([N, 1])]) + d = X_input.shape[1] + D = d + 1 + + # split the dataset into training and testing + X_train, X_test, y_train, y_test = train_test_split(X_input, y_input, test_size=0.2, random_state=42) + + a0, b0 = 1, 0.01 #hyper-parameters + model = BayesianLR(X_train, y_train, 100, a0, b0) # batchsize = 100 + + # initialization + M = 256 # number of particles + theta0 = np.zeros([M, D]); + alpha0 = np.random.gamma(a0, b0, M); + for i in range(M): + theta0[i, :] = np.hstack([np.random.normal(0, np.sqrt(1 / alpha0[i]), d), np.log(alpha0[i])]) + + tic = time.time() + theta = SVGD().update(x0=theta0, lnprob=model.dlnprob, bandwidth=-1, n_iter=6000, stepsize=0.05, alpha=0.9, debug=True) + + toc = time.time() + + print("Time: %.1f seconds" % (toc - tic)) + + print('[accuracy, log-likelihood]') + print(model.evaluation(theta, X_test, y_test)) diff --git a/scripts/gmm.py b/scripts/gmm.py new file mode 100644 index 0000000..c409964 --- /dev/null +++ b/scripts/gmm.py @@ -0,0 +1,148 @@ +import torch +from torch import distributions as D +from torch.nn import functional as F + + +class MixtureSameFamily(D.Distribution): + """ Mixture (same-family) distribution. + + The `MixtureSameFamily` distribution implements a (batch of) mixture + distribution where all components are from different parameterizations of + the same distribution type. It is parameterized by a `Categorical` + "selecting distribution" (over `k` components) and a components + distribution, i.e., a `Distribution` with a rightmost batch shape + (equal to `[k]`) which indexes each (batch of) component. 
+ """ + + def __init__(self, + mixture_distribution, + components_distribution, + validate_args=None): + """ Construct a 'MixtureSameFamily' distribution + + Args:: + mixture_distribution: `torch.distributions.Categorical`-like + instance. Manages the probability of selecting components. + The number of categories must match the rightmost batch + dimension of the `components_distribution`. Must have either + scalar `batch_shape` or `batch_shape` matching + `components_distribution.batch_shape[:-1]` + components_distribution: `torch.distributions.Distribution`-like + instance. Right-most batch dimension indexes components. + + Examples:: + # Construct Gaussian Mixture Model in 1D consisting of 5 equally + # weighted normal distributions + >>> mix = D.Categorical(torch.ones(5,)) + >>> comp = D.Normal(torch.randn(5,), torch.rand(5,)) + >>> gmm = MixtureSameFamily(mix, comp) + + # Construct Gaussian Mixture Modle in 2D consisting of 5 equally + # weighted bivariate normal distributions + >>> mix = D.Categorical(torch.ones(5,)) + >>> comp = D.Independent(D.Normal( + torch.randn(5,2), torch.rand(5,2)), 1) + >>> gmm = MixtureSameFamily(mix, comp) + + # Construct a batch of 3 Gaussian Mixture Models in 2D each + # consisting of 5 random weighted bivariate normal distributions + >>> mix = D.Categorical(torch.rand(3,5)) + >>> comp = D.Independent(D.Normal( + torch.randn(3,5,2), torch.rand(3,5,2)), 1) + >>> gmm = MixtureSameFamily(mix, comp) + + """ + self._mixture_distribution = mixture_distribution + self._components_distribution = components_distribution + + if not isinstance(self._mixture_distribution, D.Categorical): + raise ValueError(" The Mixture distribution needs to be an " + " instance of torch.distribtutions.Categorical") + + if not isinstance(self._components_distribution, D.Distribution): + raise ValueError("The Component distribution need to be an " + "instance of torch.distributions.Distribution") + + # Check that batch size matches + mdbs = self._mixture_distribution.batch_shape + cdbs = self._components_distribution.batch_shape[:-1] + if len(mdbs) != 0 and mdbs != cdbs: + raise ValueError("`mixture_distribution.batch_shape` ({0}) is not " + "compatible with `components_distribution." 
+ "batch_shape`({1})".format(mdbs, cdbs)) + + # Check that the number of mixture components matches + km = self._mixture_distribution.logits.shape[-1] + kc = self._components_distribution.batch_shape[-1] + if km is not None and kc is not None and km != kc: + raise ValueError("`mixture_distribution components` ({0}) does not" + " equal `components_distribution.batch_shape[-1]`" + " ({1})".format(km, kc)) + self._num_components = km + + event_shape = self._components_distribution.event_shape + self._event_ndims = len(event_shape) + super(MixtureSameFamily, self).__init__(batch_shape=cdbs, + event_shape=event_shape, + validate_args=validate_args) + + @property + def mixture_distribution(self): + return self._mixture_distribution + + @property + def components_distribution(self): + return self._components_distribution + + @property + def mean(self): + probs = self._pad_mixture_dimensions(self.mixture_distribution.probs) + return torch.sum(probs * self.components_distribution.mean, + dim=-1-self._event_ndims) # [B, E] + + @property + def variance(self): + # Law of total variance: Var(Y) = E[Var(Y|X)] + Var(E[Y|X]) + probs = self._pad_mixture_dimensions(self.mixture_distribution.probs) + mean_cond_var = torch.sum(probs*self.components_distribution.variance, + dim=-1-self._event_ndims) + var_cond_mean = torch.sum(probs * (self.components_distribution.mean - + self._pad(self.mean)).pow(2.0), + dim=-1-self._event_ndims) + return mean_cond_var + var_cond_mean + + def log_prob(self, x): + x = self._pad(x) + log_prob_x = self.components_distribution.log_prob(x) # [S, B, k] + log_mix_prob = torch.log_softmax(self.mixture_distribution.logits, + dim=-1) # [B, k] + return torch.logsumexp(log_prob_x + log_mix_prob, dim=-1) # [S, B] + + def sample(self, sample_shape=torch.Size()): + with torch.no_grad(): + # [n, B] + mix_sample = self.mixture_distribution.sample(sample_shape) + # [n, B, k, E] + comp_sample = self.components_distribution.sample(sample_shape) + # [n, B, k] + mask = F.one_hot(mix_sample, self._num_components) + # [n, B, k, [1]*E] + mask = self._pad_mixture_dimensions(mask) + return torch.sum(comp_sample * mask.float(), + dim=-1-self._event_ndims) + + def _pad(self, x): + d = len(x.shape) - self._event_ndims + s = x.shape + x = x.reshape(*s[:d], 1, *s[d:]) + return x + + def _pad_mixture_dimensions(self, x): + dist_batch_ndims = self.batch_shape.numel() + cat_batch_ndims = self.mixture_distribution.batch_shape.numel() + pad_ndims = 0 if cat_batch_ndims == 1 else \ + dist_batch_ndims - cat_batch_ndims + s = x.shape + x = torch.reshape(x, shape=(*s[:-1], *(pad_ndims*[1]), + *s[-1:], *(self._event_ndims*[1]))) + return x \ No newline at end of file diff --git a/scripts/gmm_cubature.py b/scripts/gmm_cubature.py new file mode 100644 index 0000000..70271a1 --- /dev/null +++ b/scripts/gmm_cubature.py @@ -0,0 +1,160 @@ +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +plt.rc('text', usetex=True) +plt.rc('text.latex', preamble=r'\usepackage{amsmath}') +from sklearn.neighbors import BallTree +from gmm import MixtureSameFamily +import torch +import torch.autograd as autograd +from scipy.linalg import hadamard +import argparse +import time + +def produce_local_cubature(dimension): + max_power = np.ceil(np.log2(dimension)) + max_dim = int(2**max_power) + hadamard_matrix = hadamard(max_dim) + if (dimension == max_dim): + return np.vstack((hadamard_matrix, -hadamard_matrix)) + else: + new_matrix = hadamard_matrix[:, :dimension] + return np.vstack((new_matrix, -new_matrix)) + +def 
grad_log_prob(P, X): + X = X.detach().requires_grad_(True) + + log_prob = P.log_prob(X) + + return autograd.grad(log_prob.sum(), X)[0] + +def propagate_measure(measure, target_dist, local_cubature, step_size, root_two_step_size): + n = measure.size(0) + new_measure = root_two_step_size*torch.Tensor(local_cubature).repeat(n, 1) + X = measure.detach().requires_grad_() + target_dist.log_prob(X).backward(torch.ones(n)) + grads = X.grad + next_points = (measure + grads*step_size).repeat_interleave(len(local_cubature), dim=0) + new_measure += next_points + return new_measure + +def plot_output(P, X, d=7.0, step=0.1, save=False, i=None): + xv, yv = torch.meshgrid([ + torch.arange(-(d+2.0), (d-2.0), step), + torch.arange(-(d+2.0), d, step) + ], indexing="ij") + pos_xy = torch.cat((xv.unsqueeze(-1), yv.unsqueeze(-1)), dim=-1) + p_xy = P.log_prob(pos_xy.to(device)).exp().unsqueeze(-1).cpu() + + df = torch.cat([pos_xy, p_xy], dim=-1).numpy() + df = pd.DataFrame({ + 'x': df[:, :, 0].ravel(), + 'y': df[:, :, 1].ravel(), + 'p': df[:, :, 2].ravel(), + }) + + mu_hat = cubature_measure.mean(0) + S = np.cov(cubature_measure.T, bias=True) + mu = P.mean.numpy() + Sigma = P.variance.numpy() + + mnorm = np.linalg.norm(mu_hat - mu) + Snorm = np.linalg.norm(np.diag(S) - Sigma) + + fig1, ax2 = plt.subplots(constrained_layout=True) + ax2.scatter(X[:, 0], X[:, 1], c='r', s=5.) + ax2.set_aspect("equal") + plt.title(r'Iter %d; $\lVert \mu - \widehat{\mu} \rVert = %.3f; \lVert \Sigma - \widehat{\Sigma} \rVert = %.3f$' % (i, mnorm, Snorm)) + if save and i is not None: + plt.savefig("../img/%d.png" % i) + plt.close() + else: + plt.show() + + +if __name__ == "__main__": + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + parser = argparse.ArgumentParser( + description="Run cubature-based Langevin algorithm on a 2D Gaussian mixture model") + parser.add_argument("--stsize", type=float, default=.1, help="step size") + parser.add_argument("--nsteps", type=int, default=1000, + help="number of steps") + parser.add_argument("--npoints", type=int, default=1000, + help="number of points (power of two)") + args = parser.parse_args() + + step_size = args.stsize + dimension = 2 + number_of_steps = args.nsteps + number_of_points = args.npoints + no_of_recombines = number_of_points + number_of_halving = int(np.log2(dimension)) + 1 + log_no_of_recombines = int(np.log2(no_of_recombines)) + no_of_recombines = 2**log_no_of_recombines + + if dimension != int(2**(number_of_halving - 1)): + number_of_halving = int(np.log2(dimension)) + 2 + + root_two_step_size = np.sqrt(2.0*step_size) + points_per_recombine = (number_of_points * pow(2, number_of_halving))/no_of_recombines + + """ + Set up initial measure and target + """ + np.random.seed(1337) + torch.manual_seed(1337) + + mix = torch.distributions.Categorical(torch.Tensor([.2, .5, .3])) + comp = torch.distributions.Independent(torch.distributions.Normal(3.*torch.randn(3,2), torch.Tensor([[1.5, .6], [1., 1.], [1.1, 1.6]])), 1) + measure = MixtureSameFamily(mix, comp, validate_args=False) + + prior_mean = torch.Tensor([4.0] * dimension).to(device) + prior_cov = torch.Tensor([1.0] * dimension).diag().to(device) + prior = torch.distributions.MultivariateNormal( + prior_mean, covariance_matrix=prior_cov) + + prior_samples = prior.sample(torch.Size([number_of_points])).to(device) + + cubature_measure = prior_samples.clone() + + local_cubature = produce_local_cubature(dimension) + + tic = time.time() + for i in range(number_of_steps): + #if i % 25 == 0: + # 
plot_output(measure, cubature_measure, d=10., step=.1, save=True, i=i) + + # Propagate cubature measure + big_measure = propagate_measure(cubature_measure, measure, local_cubature, step_size, root_two_step_size) + + # Partition particles into subsets using a ball tree + ball_tree = BallTree(big_measure, leaf_size=0.5*points_per_recombine) + _, indices, nodes, _ = ball_tree.get_arrays() + + # Resample particles + new_indices = [np.random.choice(indices[nd[0]: nd[1]]) for nd in nodes if nd[2]] + + cubature_measure = torch.Tensor(big_measure[new_indices]) + + toc = time.time() + print("Time: %.1f seconds" % (toc - tic)) + #plot_output(measure, cubature_measure, 10.0, .1, save=True, i=1000) + + cubature_measure = np.array(cubature_measure) + + print("Number of particles: ", len(cubature_measure)) + print("") + print("Cubature mean vector") + print(cubature_measure.mean(0)) + print("") + print("Cubature covariance matrix") + print(np.cov(cubature_measure.T, bias=True)) + print("") + print("Actual mean vector") + print(np.array(measure.mean)) + print("") + print("Actual variances") + print(np.array(measure.variance)) + print("") + print("Difference in means") + print(np.linalg.norm(cubature_measure.mean(0) - np.array(measure.mean))) \ No newline at end of file diff --git a/scripts/gmm_mcmc.py b/scripts/gmm_mcmc.py new file mode 100644 index 0000000..d33c00b --- /dev/null +++ b/scripts/gmm_mcmc.py @@ -0,0 +1,120 @@ +import matplotlib.pyplot as plt +plt.rc('text', usetex=True) +plt.rc('text.latex', preamble=r'\usepackage{amsmath}') + +from gmm import MixtureSameFamily +import torch +import numpy as np +import time +import pandas as pd +import argparse + +def plot_output(P, X, d=7.0, step=0.1, save=False, i=None): + xv, yv = torch.meshgrid([ + torch.arange(-d, d, step), + torch.arange(-d, d, step) + ], indexing="ij") + pos_xy = torch.cat((xv.unsqueeze(-1), yv.unsqueeze(-1)), dim=-1) + p_xy = P.log_prob(pos_xy.to(device)).exp().unsqueeze(-1).cpu() + + df = torch.cat([pos_xy, p_xy], dim=-1).numpy() + df = pd.DataFrame({ + 'x': df[:, :, 0].ravel(), + 'y': df[:, :, 1].ravel(), + 'p': df[:, :, 2].ravel(), + }) + + mu_hat = points.mean(0) + S = np.cov(points.T, bias=True) + mu = P.mean.numpy() + Sigma = P.variance.numpy() + + mnorm = np.linalg.norm(mu_hat - mu) + Snorm = np.linalg.norm(np.diag(S) - Sigma) + + fig1, ax2 = plt.subplots(constrained_layout=True) + + ax2.scatter(X[:, 0], X[:, 1], c='r', s=5.) 
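+    # Note: the density grid assembled in `df` above is computed but not drawn; only the particle scatter is plotted.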
+ ax2.set_aspect("equal") + plt.title(r'Iter %d; $\lVert \mu - \hat{\mu} \rVert = %.3f; \lVert \Sigma - S \rVert = %.3f$' % (i, mnorm, Snorm)) + if save and i is not None: + plt.savefig("../img/%d.png" % i) + plt.close() + else: + plt.show() + +if __name__ == "__main__": + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + + parser = argparse.ArgumentParser( + description="Run unadjusted Langevin algorithm on a 2D Gaussian mixture") + parser.add_argument("--stsize", type=float, default=.1, help="step size") + parser.add_argument("--nsteps", type=int, default=1001000, + help="number of steps") + parser.add_argument("--burnin", type=int, default=1000, + help="burn-in period") + args = parser.parse_args() + + step_size = args.stsize + number_of_steps = args.nsteps + burn_in = args.burnin + dimension = 2 + + root_two_step_size = np.sqrt(2.0*step_size) + + """ + Set up initial measure and target + """ + np.random.seed(1337) + torch.manual_seed(1337) + + mix = torch.distributions.Categorical(torch.Tensor([.2, .5, .3])) + comp = torch.distributions.Independent(torch.distributions.Normal(3.*torch.randn(3,2), torch.Tensor([[1.5, .6], [1., 1.], [1.1, 1.6]])), 1) + measure = MixtureSameFamily(mix, comp, validate_args=False) + + prior_mean = torch.Tensor([4.0] * dimension).to(device) + prior_cov = torch.Tensor([1.0] * dimension).diag().to(device) + prior = torch.distributions.MultivariateNormal(prior_mean, covariance_matrix=prior_cov) + + tic = time.time() + + current_point = prior.sample().to(device) + mu0 = np.zeros(dimension) + Sigma0 = np.eye(dimension) + standard_normal = torch.distributions.MultivariateNormal(torch.Tensor(mu0).to(device), + covariance_matrix=torch.Tensor(Sigma0).to(device)) + + points = torch.zeros(number_of_steps - burn_in, dimension) + points[0] = current_point + for i in range(number_of_steps): + X = current_point.detach().requires_grad_() + measure.log_prob(X).backward() + w = standard_normal.sample().to(device) + current_point = current_point + (X.grad)*step_size + root_two_step_size*w + if (i >= burn_in): + points[i-burn_in] = current_point + + toc = time.time() + + points = np.array(points) + + print("Time: %.1f seconds" % (toc - tic)) + + print("Number of samples: ", len(points)) + print("") + print("Empircal mean vector") + print(points.mean(0)) + print("") + print("Empircal covariance matrix") + print(np.cov(points.T, bias=True)) + print("") + print("Actual mean vector") + print(np.array(measure.mean)) + print("") + print("Actual variances") + print(np.array(measure.variance)) + print("") + print("Difference in means") + print(np.linalg.norm(points.mean(0) - np.array(measure.mean))) + + #plot_output(measure, points, 15., .1, save=True, i=1000) \ No newline at end of file diff --git a/scripts/svgd.py b/scripts/svgd.py new file mode 100644 index 0000000..f1628b7 --- /dev/null +++ b/scripts/svgd.py @@ -0,0 +1,57 @@ +# This code is taken from https://github.com/dilinwang820/Stein-Variational-Gradient-Descent/blob/master/python/svgd.py + +import numpy as np +from scipy.spatial.distance import pdist, squareform + +class SVGD(): + + def __init__(self): + pass + + def svgd_kernel(self, theta, h = -1): + sq_dist = pdist(theta) + pairwise_dists = squareform(sq_dist)**2 + if h < 0: # if h < 0, using median trick + h = np.median(pairwise_dists) + h = np.sqrt(0.5 * h / np.log(theta.shape[0]+1)) + + # compute the rbf kernel + Kxy = np.exp( -pairwise_dists / h**2 / 2) + + dxkxy = -np.matmul(Kxy, theta) + sumkxy = np.sum(Kxy, axis=1) + for i in 
range(theta.shape[1]): + dxkxy[:, i] = dxkxy[:,i] + np.multiply(theta[:,i],sumkxy) + dxkxy = dxkxy / (h**2) + return (Kxy, dxkxy) + + + def update(self, x0, lnprob, n_iter = 1000, stepsize = 1e-3, bandwidth = -1, alpha = 0.9, debug = False): + # Check input + if x0 is None or lnprob is None: + raise ValueError('x0 or lnprob cannot be None!') + + theta = np.copy(x0) + + # adagrad with momentum + fudge_factor = 1e-6 + historical_grad = 0 + for iter in range(n_iter): + if debug and (iter+1) % 1000 == 0: + print('iter ' + str(iter+1)) + + lnpgrad = lnprob(theta) + # calculating the kernel matrix + kxy, dxkxy = self.svgd_kernel(theta, h = -1) + grad_theta = (np.matmul(kxy, lnpgrad) + dxkxy) / x0.shape[0] + + # adagrad + if iter == 0: + historical_grad = historical_grad + grad_theta ** 2 + else: + historical_grad = alpha * historical_grad + (1 - alpha) * (grad_theta ** 2) + adj_grad = np.divide(grad_theta, fudge_factor+np.sqrt(historical_grad)) + theta = theta + stepsize * adj_grad + + return theta +
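+
+# Usage sketch (hypothetical example, not part of the original SVGD code): sampling a
+# standard 2-D Gaussian, whose log-density gradient is simply -theta.
+#
+#   import numpy as np
+#   from svgd import SVGD
+#
+#   x0 = np.random.normal(0, 1, [100, 2])            # 100 particles in 2 dimensions
+#   theta = SVGD().update(x0, lambda t: -t, n_iter=1000, stepsize=0.01)
+#   print(theta.mean(axis=0), theta.var(axis=0))      # should be close to 0 and 1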