Distance matrix energy test and MCMC tweaks

- Added the distance-matrix-based energy distance code - Tweaked the proposal matrix initialization of the MCMC algorithm - Acceptance rate check passed out of MCMC routine - Beta fit with uniform prior and uncorrelated noise tested and appears to work
Jeasonxiu · May 5, 2016 · 84b9942 · 84b9942
1 parent 1fc9366
commit 84b9942
Show file tree

Hide file tree

Showing 8 changed files with 184 additions and 8 deletions.
diff --git a/EnSt2DistMat.m b/EnSt2DistMat.m
@@ -0,0 +1,36 @@
+function e = EnSt2DistMat(ind1, ind2, dmat)
+% Compute the energy distance statistic using the pre-computed distance
+% matrix. The "ind1" and "ind2" should be permutations of indices of the
+% pooled sample, each of the size of the two samples being tested. That is,
+% X has some size N1, Y has some size N2, the pooled sample has some size
+% N = N1 + N2, so ind1 is some permutation of 1:N of size N1, and ind2 are
+% the remaining indices not chosen. 
+
+n1 = length(ind1);
+n2 = length(ind2);
+
+N = n1 + n2;
+
+diff1 = 0;
+diff2 = 0;
+diff3 = 0;
+
+for i = 1:n1
+    for j = 1:n2
+        diff1 = diff1 + dist_from_dmat(ind1(i), ind2(j), dmat, N);
+    end
+end
+
+for i = 1:n1
+    for j = 1:n1
+        diff2 = diff2 + dist_from_dmat(ind1(i), ind1(j), dmat, N);
+    end
+end
+
+for i = 1:n2
+    for j = 1:n2
+        diff3 = diff3 + dist_from_dmat(ind2(i), ind2(j), dmat, N);
+    end
+end
+
+e = (diff1*2/(n1*n2) - diff2/n1^2 - diff3/n2^2) * n1*n2 / N;
diff --git a/demo_case1.m b/demo_case1.m
@@ -4,6 +4,7 @@
 % True parameters. These are used to generate data and that
 % data is used to infer these parameters. 
 beta = randn(Nbeta, 1)*10;
+%lambda = 2 + rand*5;
 lambda = 100 + rand*200;
 phi = 0.5;
 
@@ -47,9 +48,13 @@
 % Non-informative prior
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
+disp('***********************************************')
+disp('Beta unknown, uncorrelated noise, uniform prior')
+disp('***********************************************')
 post1 = eval_posterior(param1); 
-
-chain = do_simple_mcmc(param1);
+[chain1, acrj1] = do_simple_mcmc(param1, post1);
+disp('Done.')
+disp(' ')
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 % Uncorrelated error 

diff --git a/dist_from_dmat.m b/dist_from_dmat.m
@@ -0,0 +1,17 @@
+function d = dist_from_dmat(i, j, dmat, N)
+% Grab the distance from distance matrix "dmat", which is a vector storing
+% the triangular elements of the pairwise distance matrix.
+
+% Always have i < j
+if j > i
+    k = i;
+    i = j;
+    j = k;
+end
+
+
+% This formula just converts the i,j index into the vector format we've
+% used. 
+
+d = dmat(i + (j-1)*N - (j-1)*j/2);
+end
diff --git a/dist_matrix.m b/dist_matrix.m
@@ -0,0 +1,23 @@
+function [dmat] = dist_matrix(X, Y)
+% Compute the distance matrix for pooled samples X and Y for use
+% with the energy distance code. dmat is a vector holding the 
+% triangular portions of the distance matrix. 
+% X is d by n1, Y is d by n2, with d being the dimension, n1,n2 being
+% the number of samples.
+
+[d, N1] = size(X);
+N2 = size(Y,2);
+
+N = N1 + N2;
+
+dmat = zeros((N+1)*N/2, 1);
+
+Z = [X, Y];
+k = 1;
+for i = 1:N
+    for j=i:N
+        dmat(k) = norm(Z(:,i) - Z(:,j));
+        k = k + 1;
+    end
+end
+end
diff --git a/do_simple_mcmc.m b/do_simple_mcmc.m
@@ -1,13 +1,16 @@
-function qchain = do_simple_mcmc(param, Nchain)
+function [qchain, accrej] = do_simple_mcmc(param, post, Nchain)
 % chain = do_simple_mcmc(param) - Use a Metropolis-Hastings algorithm to generate
 % a 10,000 iterate chain drawn from the posterior of the problem defined by param.
 % 
-% chain = do_simple_mcmc(param, Nchain) - Same as above, but generate Nchain 
+% chain = do_simple_mcmc(param, post) - Same as above but use the posterior 
+% estimates to initialize the proposal matrix.
+%
+% chain = do_simple_mcmc(param, post, Nchain) - Same as above, but generate Nchain 
 % samples.
 %
 % NOTE: This does not use any of the unknown parameters in anyway. 
 
-if nargin < 2
+if nargin < 3
     Nchain = 1e4; 
 end
 
@@ -42,10 +45,11 @@
     qchain(1, Nbeta+2) = param.phi;
 end
 
-res = y - G*param.beta;
+res = y - G*post.mu2;
 s2ols = res'*res / (param.N - Np);
 Ri = eval_corrfuncinv(param);
-V = s2ols*param.lambda*inv(G'*Ri*G);
+%V = s2ols*param.lambda*inv(G'*Ri*G);
+V = s2ols*inv(G'*Ri*G);
 L = chol(V)';
 
 

diff --git a/draw_posterior_sample.m b/draw_posterior_sample.m
@@ -0,0 +1,15 @@
+function sample = draw_posterior_sample(param, post, M)
+% draw_posterior_sample(param, post, N) - Draw N samples from the posterior
+% defined by post for the problem defined by param.
+
+if strcmp(param.unknowns, 'beta')
+    L = chol(post.sigma2 / param.lambda);
+    sample = zeros(M, param.Nbeta);
+    for j = 1:M
+        sample(j, :) = post.mu2' + randn(1, param.Nbeta)*L;
+    end
+elseif strcmp(param.unknowns, 'beta_lambda')
+elseif strcmp(param.unknowns, 'beta_lambda_phi')
+end
+
+end
diff --git a/energy_dist_test.m b/energy_dist_test.m
@@ -44,7 +44,8 @@
 estat(1) = eobs;
 
 for j = 2:B+1
-    idx = randsample(N, N);
+    %idx = randsample(N, N);
+    idx = randperm(N);
     x = pool(:, idx(1:n1));
     y = pool(:, idx(n1+1:end));
     estat(j) = EnSt2(x, y);

diff --git a/energy_dist_test_dmat.m b/energy_dist_test_dmat.m
@@ -0,0 +1,75 @@
+function [reject, pval, estat, B] = energy_dist_test_dmat(s1, s2, alpha, B)
+% Do the two-sample energy statistic test for equality of distributions.
+% s1 = d-by-N1 set of N1 samples of d-dimensional variables
+% s2 = d-by-N2 set of N2 samples of d-dimensional variables
+% alpha = significance level to use. The hypothesis that the distributions
+%         are equal is rejected if more than 100*(1-alpha) percent of
+%         the permutation samples from the pooled data are smaller than
+%         the observed value.
+% B  = number of permutation trials to use for approximating the 
+%      distribution of the test statistic. If not provided, is
+%      automatically computed from alpha. 
+%
+% The test compares the data in s1 to the data in s2 to see if the samples
+% appear to come from the same distribution.
+%
+% reject = True if rejected at alpha significance level, False otherwise.
+%          i.e., if true, the data supports the notion that s1 and s2 are
+%          from different distributions at significance level alpha.
+%          If false, it is undetermined.
+% pval = ratio of permutation samples with energy distance greater than or 
+%        equal to the observed energy distance (includes the observed).
+% estat = vector of energy statistics for the B+1 permutation samples
+%         (includes the non-permuted s1, s2)
+
+
+if nargin < 4
+    B = 5*ceil(1/alpha)-1;
+end
+
+if 1/(B+1) >= alpha
+    disp('Warning: Choice of B may not be appropriate for desired signifcance, alpha');
+end
+
+% Note, need the same d for s1 and s2. 
+[d, n1] = size(s1);
+[d, n2] = size(s2);
+
+dmat = dist_matrix(s1, s2);
+
+N = n1+n2;
+
+pool = [s1, s2];
+
+estat = zeros(B+1, 1);
+%eobs = EnSt2(s1, s2);
+id1 = 1:n1;
+id2 = (n1+1):(n1+n2);
+eobs = EnSt2DistMat(id1, id2, dmat);
+estat(1) = eobs;
+
+for j = 2:B+1
+
+    %idx = randsample(N, N);
+    idx = randperm(N);
+    %x = pool(:, idx(1:n1));
+    %y = pool(:, idx(n1+1:end));
+    %estat(j) = EnSt2(x, y);
+
+    % The distance-matrix-based version used the permuted indices of the pooled
+    % sample, not the samples themselves. 
+    x = idx(1:n1);
+    y = idx(n1+1:end);
+    estat(j) = EnSt2DistMat(x, y, dmat);
+end
+
+pval = sum(eobs < estat) / (B+1);
+
+if alpha > pval
+    reject = true;
+else
+    reject = false;
+end
+
+end
+