Added energy statistic test

Added code for the energy statistic test
Jeasonxiu · May 4, 2016 · 1fc9366 · 1fc9366
1 parent 0134eee
commit 1fc9366
Show file tree

Hide file tree

Showing 2 changed files with 113 additions and 0 deletions.
diff --git a/EnSt2.m b/EnSt2.m
@@ -0,0 +1,51 @@
+function [e] = EnSt2(X, Y)
+%EnSt2 Energy statistic for samples x and y
+%   Computes the energy statistic for samples X and Y which are 
+%   d by n1 and d by n2 matrices, respectively containing n1, n2 
+%   observations of the d-dimensional variables.
+
+[d, N1] = size(X);
+N2 = size(Y,2);
+n = N1 + N2;
+
+% Implementation will always assume x has more samples
+if N1 < N2
+    x = Y;
+    n1 = N2;
+    y = X;
+    n2 = N1;
+else
+    x = X;
+    n1 = N1;
+    y = Y;
+    n2 = N2;
+end
+
+
+diff1 = 0;
+diff2 = 0;
+diff3 = 0;
+% All pairwise subtractions implemented by circularly shifting 
+for j = 1 : n1    
+    xs = circshift(x, [0 j]);
+    diff1 = diff1 + sumdists(xs(:,1:n2), y);
+    diff2 = diff2 + sumdists(xs, x);
+end
+
+for j = 1 : n2
+    ys = circshift(y, [0 j]);
+    diff3 = diff3 + sumdists(ys, y);
+end
+
+e = (diff1*2/(n1*n2) - diff2/n1^2 - diff3/n2^2) * n1*n2 / n;
+
+end
+
+
+
+function v = sumdists(x, y)
+% Computes the sum of the distances between the given sets of 
+% observations. The number of observations (i.e., number of columns in
+% this code) must be equal.
+    v = sum(sqrt(sum(abs(x - y).^2,1)));
+end
diff --git a/energy_dist_test.m b/energy_dist_test.m
@@ -0,0 +1,62 @@
+function [reject, pval, estat, B] = energy_dist_test(s1, s2, alpha, B)
+% Do the two-sample energy statistic test for equality of distributions.
+% s1 = d-by-N1 set of N1 samples of d-dimensional variables
+% s2 = d-by-N2 set of N2 samples of d-dimensional variables
+% alpha = significance level to use. The hypothesis that the distributions
+%         are equal is rejected if more than 100*(1-alpha) percent of
+%         the permutation samples from the pooled data are smaller than
+%         the observed value.
+% B  = number of permutation trials to use for approximating the 
+%      distribution of the test statistic. If not provided, is
+%      automatically computed from alpha. 
+%
+% The test compares the data in s1 to the data in s2 to see if the samples
+% appear to come from the same distribution.
+%
+% reject = True if rejected at alpha significance level, False otherwise.
+%          i.e., if true, the data supports the notion that s1 and s2 are
+%          from different distributions at significance level alpha.
+%          If false, it is undetermined.
+% pval = ratio of permutation samples with energy distance greater than or 
+%        equal to the observed energy distance (includes the observed).
+% estat = vector of energy statistics for the B+1 permutation samples
+%         (includes the non-permuted s1, s2)
+
+
+if nargin < 4
+    B = 5*ceil(1/alpha)-1;
+end
+
+if 1/(B+1) >= alpha
+    disp('Warning: Choice of B may not be appropriate for desired signifcance, alpha');
+end
+
+% Note, need the same d for s1 and s2. 
+[d, n1] = size(s1);
+[d, n2] = size(s2);
+
+N = n1+n2;
+
+pool = [s1, s2];
+
+estat = zeros(B+1, 1);
+eobs = EnSt2(s1, s2);
+estat(1) = eobs;
+
+for j = 2:B+1
+    idx = randsample(N, N);
+    x = pool(:, idx(1:n1));
+    y = pool(:, idx(n1+1:end));
+    estat(j) = EnSt2(x, y);
+end
+
+pval = sum(eobs < estat) / (B+1);
+
+if alpha > pval
+    reject = true;
+else
+    reject = false;
+end
+
+end
+