Skip to content

Commit

Permalink
Added energy statistic test
Browse files Browse the repository at this point in the history
Added code for the energy statistic test
  • Loading branch information
jmcmahan committed May 4, 2016
1 parent 0134eee commit 1fc9366
Show file tree
Hide file tree
Showing 2 changed files with 113 additions and 0 deletions.
51 changes: 51 additions & 0 deletions EnSt2.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
function [e] = EnSt2(X, Y)
%EnSt2 Energy statistic for samples x and y
% Computes the energy statistic for samples X and Y which are
% d by n1 and d by n2 matrices, respectively containing n1, n2
% observations of the d-dimensional variables.

[d, N1] = size(X);
N2 = size(Y,2);
n = N1 + N2;

% Implementation will always assume x has more samples
if N1 < N2
x = Y;
n1 = N2;
y = X;
n2 = N1;
else
x = X;
n1 = N1;
y = Y;
n2 = N2;
end


diff1 = 0;
diff2 = 0;
diff3 = 0;
% All pairwise subtractions implemented by circularly shifting
for j = 1 : n1
xs = circshift(x, [0 j]);
diff1 = diff1 + sumdists(xs(:,1:n2), y);
diff2 = diff2 + sumdists(xs, x);
end

for j = 1 : n2
ys = circshift(y, [0 j]);
diff3 = diff3 + sumdists(ys, y);
end

e = (diff1*2/(n1*n2) - diff2/n1^2 - diff3/n2^2) * n1*n2 / n;

end



function v = sumdists(x, y)
% Computes the sum of the distances between the given sets of
% observations. The number of observations (i.e., number of columns in
% this code) must be equal.
v = sum(sqrt(sum(abs(x - y).^2,1)));
end
62 changes: 62 additions & 0 deletions energy_dist_test.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
function [reject, pval, estat, B] = energy_dist_test(s1, s2, alpha, B)
% Do the two-sample energy statistic test for equality of distributions.
% s1 = d-by-N1 set of N1 samples of d-dimensional variables
% s2 = d-by-N2 set of N2 samples of d-dimensional variables
% alpha = significance level to use. The hypothesis that the distributions
% are equal is rejected if more than 100*(1-alpha) percent of
% the permutation samples from the pooled data are smaller than
% the observed value.
% B = number of permutation trials to use for approximating the
% distribution of the test statistic. If not provided, is
% automatically computed from alpha.
%
% The test compares the data in s1 to the data in s2 to see if the samples
% appear to come from the same distribution.
%
% reject = True if rejected at alpha significance level, False otherwise.
% i.e., if true, the data supports the notion that s1 and s2 are
% from different distributions at significance level alpha.
% If false, it is undetermined.
% pval = ratio of permutation samples with energy distance greater than or
% equal to the observed energy distance (includes the observed).
% estat = vector of energy statistics for the B+1 permutation samples
% (includes the non-permuted s1, s2)


if nargin < 4
B = 5*ceil(1/alpha)-1;
end

if 1/(B+1) >= alpha
disp('Warning: Choice of B may not be appropriate for desired signifcance, alpha');
end

% Note, need the same d for s1 and s2.
[d, n1] = size(s1);
[d, n2] = size(s2);

N = n1+n2;

pool = [s1, s2];

estat = zeros(B+1, 1);
eobs = EnSt2(s1, s2);
estat(1) = eobs;

for j = 2:B+1
idx = randsample(N, N);
x = pool(:, idx(1:n1));
y = pool(:, idx(n1+1:end));
estat(j) = EnSt2(x, y);
end

pval = sum(eobs < estat) / (B+1);

if alpha > pval
reject = true;
else
reject = false;
end

end

0 comments on commit 1fc9366

Please sign in to comment.