Skip to content

Commit

Permalink
Distance matrix energy test and MCMC tweaks
Browse files Browse the repository at this point in the history
- Added the distance-matrix-based energy distance code
- Tweaked the proposal matrix initialization of the MCMC algorithm
- Acceptance rate check passed out of MCMC routine
- Beta fit with uniform prior and uncorrelated noise tested and appears
to work
  • Loading branch information
jmcmahan committed May 5, 2016
1 parent 1fc9366 commit 84b9942
Show file tree
Hide file tree
Showing 8 changed files with 184 additions and 8 deletions.
36 changes: 36 additions & 0 deletions EnSt2DistMat.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
function e = EnSt2DistMat(ind1, ind2, dmat)
% Compute the energy distance statistic using the pre-computed distance
% matrix. The "ind1" and "ind2" should be permutations of indices of the
% pooled sample, each of the size of the two samples being tested. That is,
% X has some size N1, Y has some size N2, the pooled sample has some size
% N = N1 + N2, so ind1 is some permutation of 1:N of size N1, and ind2 are
% the remaining indices not chosen.

n1 = length(ind1);
n2 = length(ind2);

N = n1 + n2;

diff1 = 0;
diff2 = 0;
diff3 = 0;

for i = 1:n1
for j = 1:n2
diff1 = diff1 + dist_from_dmat(ind1(i), ind2(j), dmat, N);
end
end

for i = 1:n1
for j = 1:n1
diff2 = diff2 + dist_from_dmat(ind1(i), ind1(j), dmat, N);
end
end

for i = 1:n2
for j = 1:n2
diff3 = diff3 + dist_from_dmat(ind2(i), ind2(j), dmat, N);
end
end

e = (diff1*2/(n1*n2) - diff2/n1^2 - diff3/n2^2) * n1*n2 / N;
9 changes: 7 additions & 2 deletions demo_case1.m
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
% True parameters. These are used to generate data and that
% data is used to infer these parameters.
beta = randn(Nbeta, 1)*10;
%lambda = 2 + rand*5;
lambda = 100 + rand*200;
phi = 0.5;

Expand Down Expand Up @@ -47,9 +48,13 @@
% Non-informative prior
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

disp('***********************************************')
disp('Beta unknown, uncorrelated noise, uniform prior')
disp('***********************************************')
post1 = eval_posterior(param1);

chain = do_simple_mcmc(param1);
[chain1, acrj1] = do_simple_mcmc(param1, post1);
disp('Done.')
disp(' ')

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Uncorrelated error
Expand Down
17 changes: 17 additions & 0 deletions dist_from_dmat.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
function d = dist_from_dmat(i, j, dmat, N)
% Grab the distance from distance matrix "dmat", which is a vector storing
% the triangular elements of the pairwise distance matrix.

% Always have i < j
if j > i
k = i;
i = j;
j = k;
end


% This formula just converts the i,j index into the vector format we've
% used.

d = dmat(i + (j-1)*N - (j-1)*j/2);
end
23 changes: 23 additions & 0 deletions dist_matrix.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
function [dmat] = dist_matrix(X, Y)
% Compute the distance matrix for pooled samples X and Y for use
% with the energy distance code. dmat is a vector holding the
% triangular portions of the distance matrix.
% X is d by n1, Y is d by n2, with d being the dimension, n1,n2 being
% the number of samples.

[d, N1] = size(X);
N2 = size(Y,2);

N = N1 + N2;

dmat = zeros((N+1)*N/2, 1);

Z = [X, Y];
k = 1;
for i = 1:N
for j=i:N
dmat(k) = norm(Z(:,i) - Z(:,j));
k = k + 1;
end
end
end
14 changes: 9 additions & 5 deletions do_simple_mcmc.m
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
function qchain = do_simple_mcmc(param, Nchain)
function [qchain, accrej] = do_simple_mcmc(param, post, Nchain)
% chain = do_simple_mcmc(param) - Use a Metropolis-Hastings algorithm to generate
% a 10,000 iterate chain drawn from the posterior of the problem defined by param.
%
% chain = do_simple_mcmc(param, Nchain) - Same as above, but generate Nchain
% chain = do_simple_mcmc(param, post) - Same as above but use the posterior
% estimates to initialize the proposal matrix.
%
% chain = do_simple_mcmc(param, post, Nchain) - Same as above, but generate Nchain
% samples.
%
% NOTE: This does not use any of the unknown parameters in anyway.

if nargin < 2
if nargin < 3
Nchain = 1e4;
end

Expand Down Expand Up @@ -42,10 +45,11 @@
qchain(1, Nbeta+2) = param.phi;
end

res = y - G*param.beta;
res = y - G*post.mu2;
s2ols = res'*res / (param.N - Np);
Ri = eval_corrfuncinv(param);
V = s2ols*param.lambda*inv(G'*Ri*G);
%V = s2ols*param.lambda*inv(G'*Ri*G);
V = s2ols*inv(G'*Ri*G);
L = chol(V)';


Expand Down
15 changes: 15 additions & 0 deletions draw_posterior_sample.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
function sample = draw_posterior_sample(param, post, M)
% draw_posterior_sample(param, post, N) - Draw N samples from the posterior
% defined by post for the problem defined by param.

if strcmp(param.unknowns, 'beta')
L = chol(post.sigma2 / param.lambda);
sample = zeros(M, param.Nbeta);
for j = 1:M
sample(j, :) = post.mu2' + randn(1, param.Nbeta)*L;
end
elseif strcmp(param.unknowns, 'beta_lambda')
elseif strcmp(param.unknowns, 'beta_lambda_phi')
end

end
3 changes: 2 additions & 1 deletion energy_dist_test.m
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@
estat(1) = eobs;

for j = 2:B+1
idx = randsample(N, N);
%idx = randsample(N, N);
idx = randperm(N);
x = pool(:, idx(1:n1));
y = pool(:, idx(n1+1:end));
estat(j) = EnSt2(x, y);
Expand Down
75 changes: 75 additions & 0 deletions energy_dist_test_dmat.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
function [reject, pval, estat, B] = energy_dist_test_dmat(s1, s2, alpha, B)
% Do the two-sample energy statistic test for equality of distributions.
% s1 = d-by-N1 set of N1 samples of d-dimensional variables
% s2 = d-by-N2 set of N2 samples of d-dimensional variables
% alpha = significance level to use. The hypothesis that the distributions
% are equal is rejected if more than 100*(1-alpha) percent of
% the permutation samples from the pooled data are smaller than
% the observed value.
% B = number of permutation trials to use for approximating the
% distribution of the test statistic. If not provided, is
% automatically computed from alpha.
%
% The test compares the data in s1 to the data in s2 to see if the samples
% appear to come from the same distribution.
%
% reject = True if rejected at alpha significance level, False otherwise.
% i.e., if true, the data supports the notion that s1 and s2 are
% from different distributions at significance level alpha.
% If false, it is undetermined.
% pval = ratio of permutation samples with energy distance greater than or
% equal to the observed energy distance (includes the observed).
% estat = vector of energy statistics for the B+1 permutation samples
% (includes the non-permuted s1, s2)


if nargin < 4
B = 5*ceil(1/alpha)-1;
end

if 1/(B+1) >= alpha
disp('Warning: Choice of B may not be appropriate for desired signifcance, alpha');
end

% Note, need the same d for s1 and s2.
[d, n1] = size(s1);
[d, n2] = size(s2);

dmat = dist_matrix(s1, s2);

N = n1+n2;

pool = [s1, s2];

estat = zeros(B+1, 1);
%eobs = EnSt2(s1, s2);
id1 = 1:n1;
id2 = (n1+1):(n1+n2);
eobs = EnSt2DistMat(id1, id2, dmat);
estat(1) = eobs;

for j = 2:B+1

%idx = randsample(N, N);
idx = randperm(N);
%x = pool(:, idx(1:n1));
%y = pool(:, idx(n1+1:end));
%estat(j) = EnSt2(x, y);

% The distance-matrix-based version used the permuted indices of the pooled
% sample, not the samples themselves.
x = idx(1:n1);
y = idx(n1+1:end);
estat(j) = EnSt2DistMat(x, y, dmat);
end

pval = sum(eobs < estat) / (B+1);

if alpha > pval
reject = true;
else
reject = false;
end

end

0 comments on commit 84b9942

Please sign in to comment.