Commit 64e8f4a (1 parent: bb36415): 17 changed files, with 833 additions and 21 deletions.
# Markov Chain Cubature for Bayesian Inference

This repository contains code for simulating Markov chains as a cloud of particles using cubature formulae.

The `presentation` folder contains slides describing the methodology.

The `scripts` folder contains the code.
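As a rough, self-contained sketch of the idea (an illustration under our own simplifying assumptions, not the repository's exact implementation): at each step, every particle takes a gradient drift step and is then propagated through a small set of fixed cubature points standing in for the Brownian increment, after which the enlarged cloud is recombined back down to the original particle count.

```python
# Illustrative sketch only: unadjusted Langevin dynamics where the Gaussian
# increment is replaced by a degree-3 cubature (points +/- sqrt(d) * e_i),
# followed by naive random recombination. Target: a standard 2D Gaussian.
import numpy as np

def grad_log_density(x):
    return -x  # score of N(0, I)

def cubature_step(particles, step_size, rng):
    d = particles.shape[1]
    # 2d equally weighted points with mean zero and identity covariance
    points = np.sqrt(d) * np.vstack([np.eye(d), -np.eye(d)])
    drift = particles + step_size * grad_log_density(particles)
    # Every particle spawns one copy per cubature point; sqrt(2h) matches the
    # sqrt(2) dW diffusion term of the Langevin SDE over a step of size h
    cloud = (drift[:, None, :] + np.sqrt(2 * step_size) * points[None, :, :]).reshape(-1, d)
    # Naively recombine the enlarged cloud back to the original particle count
    keep = rng.choice(len(cloud), size=len(particles), replace=False)
    return cloud[keep]

rng = np.random.default_rng(0)
particles = rng.normal(size=(64, 2))
for _ in range(500):
    particles = cubature_step(particles, 0.05, rng)
print(particles.mean(axis=0), particles.var(axis=0))  # roughly 0 and 1
```

The scripts in this repository replace the naive random recombination above with a ball-tree partition of the enlarged cloud, resampling one representative per leaf, which preserves the cloud's spatial structure rather than subsampling uniformly.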
# Examples

## 2D Gaussian Mixture

This will run the cubature algorithm on a 2D Gaussian mixture:

```shell
python gmm_cubature.py
```
This outputs something like:

```shell
Time: 18.5 seconds
Number of particles: 1024

Cubature mean vector
[-4.13376  -1.8908577]

Cubature covariance matrix
[[ 2.31224477  1.89332097]
 [ 1.89332097 10.73603521]]

Actual mean vector
[-4.148839  -1.8847313]

Actual variances
[ 2.2467885 10.953863 ]

Difference in means
0.016276047
```
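These summary statistics are ordinary empirical moments of the particle cloud; the "difference in means" is the Euclidean norm of the gap between the cubature and actual mean vectors. Given any final particle array they can be reproduced with plain NumPy (the `particles` array below is a hypothetical stand-in):

```python
import numpy as np

# Hypothetical stand-in for the algorithm's final (1024, 2) particle array
rng = np.random.default_rng(0)
particles = rng.normal(size=(1024, 2))

actual_mean = np.array([-4.148839, -1.8847313])  # known mixture mean, as reported above
print("Cubature mean vector\n", particles.mean(axis=0))
print("Cubature covariance matrix\n", np.cov(particles, rowvar=False))
print("Difference in means\n", np.linalg.norm(particles.mean(axis=0) - actual_mean))
```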
An illustration of the resulting particle configurations is given below:

<p align="middle">
  <img src="./img/0.png" width=32%>
  <img src="./img/25.png" width=32%>
  <img src="./img/100.png" width=32%>
</p>

<p align="middle">
  <img src="./img/250.png" width=32%>
  <img src="./img/500.png" width=32%>
  <img src="./img/1000.png" width=32%>
</p>
## Bayesian Logistic Regression

This runs the algorithm for a Bayesian logistic regression on the binary Covertype dataset (see the [SVGD paper](https://proceedings.neurips.cc/paper/2016/file/b3ba8f1bee1238a2f37603d90b58898d-Paper.pdf) for details):

```shell
python bayesian_logistic_regression_cubature.py
```
This outputs something like:

```shell
Time: 1137.0 seconds
Number of particles: 256
[accuracy, log-likelihood]
[0.7573040282953107, -0.5628809048706431]
```
The content of `markov_chain_cubature_presentation.pdf` is licensed under the Creative Commons Attribution 4.0 International license, and the Python code is licensed under the MIT license.
---
The datasets here are not our own contribution; please let us know of any conflicts of interest.

The binary Covertype dataset is from the [LIBSVM Data Repository](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html).
---
```python
# This code is adapted from https://github.com/dilinwang820/Stein-Variational-Gradient-Descent/blob/master/python/bayesian_logistic_regression.py

import time

import numpy as np
import numpy.matlib as nm
import scipy.io
import torch
from scipy.linalg import hadamard
from sklearn.model_selection import train_test_split
from sklearn.neighbors import BallTree

r'''
Example of Bayesian Logistic Regression (the same setting as Gershman et al. 2012):
The observed data D = {X, y} consist of N binary class labels,
y_t \in {-1,+1}, and d covariates for each datapoint, X_t \in R^d.
The hidden variables \theta = {w, \alpha} consist of d regression coefficients w_k \in R,
and a precision parameter \alpha \in R_+. We assume the following model:
    p(\alpha) = Gamma(\alpha; a, b)
    p(w_k | \alpha) = N(w_k; 0, \alpha^{-1})
    p(y_t = 1 | x_t, w) = 1 / (1 + exp(-w^T x_t))
'''
class BayesianLR:
    def __init__(self, X, Y, batchsize=100, a0=1, b0=0.01):
        self.X, self.Y = X, Y
        # NOTE: Y takes values in {+1, -1}
        self.batchsize = min(batchsize, X.shape[0])
        self.a0, self.b0 = a0, b0

        self.N = X.shape[0]
        self.permutation = np.random.permutation(self.N)
        self.iter = 0

    def dlnprob(self, theta):
        # Select the next mini-batch, cycling through a fixed permutation
        if self.batchsize > 0:
            batch = [i % self.N for i in range(self.iter * self.batchsize, (self.iter + 1) * self.batchsize)]
            ridx = self.permutation[batch]
            self.iter += 1
        else:
            ridx = np.random.permutation(self.X.shape[0])

        Xs = self.X[ridx, :]
        Ys = self.Y[ridx]

        w = theta[:, :-1]             # logistic weights
        alpha = np.exp(theta[:, -1])  # the last column is log(alpha)
        d = w.shape[1]

        wt = np.multiply((alpha / 2), np.sum(w ** 2, axis=1))

        coff = np.matmul(Xs, w.T)
        y_hat = 1.0 / (1.0 + np.exp(-1 * coff))

        dw_data = np.matmul(((nm.repmat(np.vstack(Ys), 1, theta.shape[0]) + 1) / 2.0 - y_hat).T, Xs)  # Y \in {-1,1}
        dw_prior = -np.multiply(nm.repmat(np.vstack(alpha), 1, d), w)
        dw = dw_data * 1.0 * self.X.shape[0] / Xs.shape[0] + dw_prior  # rescale to the full dataset

        dalpha = d / 2.0 - wt + (self.a0 - 1) - self.b0 * alpha + 1  # the last term is the Jacobian term

        return torch.Tensor(np.hstack([dw, np.vstack(dalpha)]))  # first-order derivative

    def evaluation(self, theta, X_test, y_test):
        theta = theta[:, :-1]
        M, n_test = theta.shape[0], len(y_test)

        prob = np.zeros([n_test, M])
        for t in range(M):
            coff = np.multiply(y_test, np.sum(-1 * np.multiply(nm.repmat(theta[t, :], n_test, 1), X_test), axis=1))
            prob[:, t] = np.divide(np.ones(n_test), (1 + np.exp(coff)))

        prob = np.mean(prob, axis=1)
        acc = np.mean(prob > 0.5)
        llh = np.mean(np.log(prob))
        return [acc, llh]


def propagate_measure(measure, gradlnprob, local_cubature, step_size, root_step_size):
    n = measure.size(0)
    new_measure = root_step_size * torch.Tensor(local_cubature).repeat(n, 1)
    gradients = gradlnprob(np.array(measure))

    # THεO POULA adjustment: tame the gradients so that large components are damped
    abs_gradient = torch.abs(gradients)
    gradients = torch.div(gradients, 1.0 + root_step_size * abs_gradient)
    gradients = gradients + torch.div(root_step_size * gradients, 1.0 + abs_gradient)

    # Each particle takes a drift step and spawns one copy per local cubature point
    next_points = (measure + gradients * step_size).repeat_interleave(len(local_cubature), dim=0)
    new_measure += next_points

    return new_measure


def produce_local_cubature(dimension):
    # Cubature points are the (possibly truncated) rows of a Hadamard matrix
    # and their negations, scaled by sqrt(2)
    max_power = np.ceil(np.log2(dimension))
    max_dim = int(2 ** max_power)
    hadamard_matrix = hadamard(max_dim)
    if dimension == max_dim:
        return np.sqrt(2.0) * np.vstack((hadamard_matrix, -hadamard_matrix))
    else:
        new_matrix = hadamard_matrix[:, :dimension]
        return np.sqrt(2.0) * np.vstack((new_matrix, -new_matrix))


if __name__ == '__main__':
    data = scipy.io.loadmat('../data/covertype.mat')

    X_input = data['covtype'][:, 1:]
    y_input = data['covtype'][:, 0]
    y_input[y_input == 2] = -1

    N = X_input.shape[0]
    X_input = np.hstack([X_input, np.ones([N, 1])])  # append a bias feature
    d = X_input.shape[1]
    dimension = d + 1  # weights plus log(alpha)

    # Split the dataset into training and testing
    X_train, X_test, y_train, y_test = train_test_split(X_input, y_input, test_size=0.2, random_state=42)

    a0, b0 = 1, 0.01  # hyperparameters
    model = BayesianLR(X_train, y_train, 100, a0, b0)  # batchsize = 100

    # Initialization: sample alpha from its Gamma prior, then w | alpha from its Gaussian prior
    number_of_points = 1024
    cubature_measure = np.zeros([number_of_points, dimension])
    alpha0 = np.random.gamma(a0, b0, number_of_points)

    for i in range(number_of_points):
        cubature_measure[i, :] = np.hstack([np.random.normal(0, np.sqrt(1 / alpha0[i]), d), np.log(alpha0[i])])

    cubature_measure = torch.Tensor(cubature_measure)
    local_cubature = produce_local_cubature(dimension)

    points_per_recombine = 128
    number_of_halving = 4  # (not used below)
    number_of_steps = 1500
    step_size = 0.01
    root_step_size = np.sqrt(step_size)

    tic = time.time()

    for i in range(number_of_steps):
        # Propagate the cubature measure
        big_measure = propagate_measure(cubature_measure, model.dlnprob, local_cubature, step_size, root_step_size)

        # Partition particles into subsets using a ball tree (leaf_size must be an int)
        ball_tree = BallTree(big_measure, leaf_size=points_per_recombine // 2)
        _, indices, nodes, _ = ball_tree.get_arrays()

        # Resample one representative particle from each leaf node
        new_indices = [np.random.choice(indices[nd[0]:nd[1]]) for nd in nodes if nd[2]]
        cubature_measure = torch.Tensor(big_measure[new_indices])

        if (i + 1) % 100 == 0:
            print('iter ' + str(i + 1))

    toc = time.time()

    print("Time: %.1f seconds" % (toc - tic))
    print("Number of particles: ", len(cubature_measure))
    print("[accuracy, log-likelihood]")
    print(model.evaluation(cubature_measure, X_test, y_test))
```
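For intuition, `produce_local_cubature(2)` returns four points built from the 2×2 Hadamard matrix; a quick check of their moments:

```python
import numpy as np
from scipy.linalg import hadamard

# The 2D local cubature: sqrt(2) * (rows of H_2 and their negations)
H = hadamard(2)
points = np.sqrt(2.0) * np.vstack((H, -H))
print(points)
# [[ 1.41421356  1.41421356]
#  [ 1.41421356 -1.41421356]
#  [-1.41421356 -1.41421356]
#  [-1.41421356  1.41421356]]
print(points.mean(axis=0))              # [0. 0.]
print(points.T @ points / len(points))  # 2 * identity
```

The mean is zero and the covariance is 2I, consistent with the sqrt(2)-scaled Brownian increment of a Langevin diffusion over a unit time step; scaling by `root_step_size` in `propagate_measure` then gives covariance 2I times the step size.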
---
```python
# This code is taken from https://github.com/dilinwang820/Stein-Variational-Gradient-Descent/blob/master/python/bayesian_logistic_regression.py

import time

import numpy as np
import numpy.matlib as nm
import scipy.io
from sklearn.model_selection import train_test_split
from svgd import SVGD

r'''
Example of Bayesian Logistic Regression (the same setting as Gershman et al. 2012):
The observed data D = {X, y} consist of N binary class labels,
y_t \in {-1,+1}, and d covariates for each datapoint, X_t \in R^d.
The hidden variables \theta = {w, \alpha} consist of d regression coefficients w_k \in R,
and a precision parameter \alpha \in R_+. We assume the following model:
    p(\alpha) = Gamma(\alpha; a, b)
    p(w_k | \alpha) = N(w_k; 0, \alpha^{-1})
    p(y_t = 1 | x_t, w) = 1 / (1 + exp(-w^T x_t))
'''
class BayesianLR:
    def __init__(self, X, Y, batchsize=100, a0=1, b0=0.01):
        self.X, self.Y = X, Y
        # NOTE: Y takes values in {+1, -1}
        self.batchsize = min(batchsize, X.shape[0])
        self.a0, self.b0 = a0, b0

        self.N = X.shape[0]
        self.permutation = np.random.permutation(self.N)
        self.iter = 0

    def dlnprob(self, theta):
        # Select the next mini-batch, cycling through a fixed permutation
        if self.batchsize > 0:
            batch = [i % self.N for i in range(self.iter * self.batchsize, (self.iter + 1) * self.batchsize)]
            ridx = self.permutation[batch]
            self.iter += 1
        else:
            ridx = np.random.permutation(self.X.shape[0])

        Xs = self.X[ridx, :]
        Ys = self.Y[ridx]

        w = theta[:, :-1]             # logistic weights
        alpha = np.exp(theta[:, -1])  # the last column is log(alpha)
        d = w.shape[1]

        wt = np.multiply((alpha / 2), np.sum(w ** 2, axis=1))

        coff = np.matmul(Xs, w.T)
        y_hat = 1.0 / (1.0 + np.exp(-1 * coff))

        dw_data = np.matmul(((nm.repmat(np.vstack(Ys), 1, theta.shape[0]) + 1) / 2.0 - y_hat).T, Xs)  # Y \in {-1,1}
        dw_prior = -np.multiply(nm.repmat(np.vstack(alpha), 1, d), w)
        dw = dw_data * 1.0 * self.X.shape[0] / Xs.shape[0] + dw_prior  # rescale to the full dataset

        dalpha = d / 2.0 - wt + (self.a0 - 1) - self.b0 * alpha + 1  # the last term is the Jacobian term

        return np.hstack([dw, np.vstack(dalpha)])  # first-order derivative

    def evaluation(self, theta, X_test, y_test):
        theta = theta[:, :-1]
        M, n_test = theta.shape[0], len(y_test)

        prob = np.zeros([n_test, M])
        for t in range(M):
            coff = np.multiply(y_test, np.sum(-1 * np.multiply(nm.repmat(theta[t, :], n_test, 1), X_test), axis=1))
            prob[:, t] = np.divide(np.ones(n_test), (1 + np.exp(coff)))

        prob = np.mean(prob, axis=1)
        acc = np.mean(prob > 0.5)
        llh = np.mean(np.log(prob))
        return [acc, llh]


if __name__ == '__main__':
    data = scipy.io.loadmat('../data/covertype.mat')

    X_input = data['covtype'][:, 1:]
    y_input = data['covtype'][:, 0]
    y_input[y_input == 2] = -1

    N = X_input.shape[0]
    X_input = np.hstack([X_input, np.ones([N, 1])])  # append a bias feature
    d = X_input.shape[1]
    D = d + 1  # weights plus log(alpha)

    # Split the dataset into training and testing
    X_train, X_test, y_train, y_test = train_test_split(X_input, y_input, test_size=0.2, random_state=42)

    a0, b0 = 1, 0.01  # hyperparameters
    model = BayesianLR(X_train, y_train, 100, a0, b0)  # batchsize = 100

    # Initialization: sample alpha from its Gamma prior, then w | alpha from its Gaussian prior
    M = 256  # number of particles
    theta0 = np.zeros([M, D])
    alpha0 = np.random.gamma(a0, b0, M)
    for i in range(M):
        theta0[i, :] = np.hstack([np.random.normal(0, np.sqrt(1 / alpha0[i]), d), np.log(alpha0[i])])

    tic = time.time()
    theta = SVGD().update(x0=theta0, lnprob=model.dlnprob, bandwidth=-1, n_iter=6000, stepsize=0.05, alpha=0.9, debug=True)
    toc = time.time()

    print("Time: %.1f seconds" % (toc - tic))
    print('[accuracy, log-likelihood]')
    print(model.evaluation(theta, X_test, y_test))
```
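The `svgd` module imported above is not part of this commit; it provides Liu and Wang's SVGD update, in which each particle follows a kernel-smoothed gradient plus a repulsive term. A minimal sketch of one such update with a fixed-bandwidth RBF kernel (the upstream implementation additionally uses a median-heuristic bandwidth when `bandwidth=-1` and AdaGrad with momentum, both omitted here for brevity):

```python
import numpy as np

def svgd_step(theta, dlnprob, stepsize=0.05, h=1.0):
    # phi(x_i) = (1/M) * sum_j [ k(x_j, x_i) grad log p(x_j) + grad_{x_j} k(x_j, x_i) ]
    M = theta.shape[0]
    diffs = theta[:, None, :] - theta[None, :, :]             # diffs[i, j] = x_i - x_j
    K = np.exp(-np.sum(diffs ** 2, axis=-1) / (2 * h ** 2))   # RBF kernel matrix
    grads = dlnprob(theta)                                    # (M, d) score at each particle
    # Attractive kernel-weighted gradients plus the repulsive kernel-gradient term
    phi = (K @ grads + np.sum(K[:, :, None] * diffs, axis=1) / h ** 2) / M
    return theta + stepsize * phi

# Toy usage: particles initialised away from a standard Gaussian target
rng = np.random.default_rng(0)
theta = rng.normal(size=(50, 2)) + 3.0
for _ in range(500):
    theta = svgd_step(theta, lambda t: -t)  # score of N(0, I)
print(theta.mean(axis=0))  # drifts toward the origin
```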