-
Notifications
You must be signed in to change notification settings - Fork 313
/
EN_DistributionEntropy.m
131 lines (121 loc) · 5.18 KB
/
EN_DistributionEntropy.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
function out = EN_DistributionEntropy(y,histOrKS,numBins,olremp)
% EN_DistributionEntropy Distributional entropy.
%
% Estimates the entropy of a data vector from its distribution. The
% distribution is estimated either using a histogram with numBins bins, or as
% a kernel-smoothed density, using the ksdensity function from Matlab's
% Statistics Toolbox with width parameter, w (specified as the input numBins).
%
% An optional additional parameter can be used to remove a proportion of the
% most extreme positive and negative deviations from the mean as an initial
% pre-processing; in that case the output is the change in entropy caused by
% removing those outliers.
%
%---INPUTS:
%
% y, the input time series
%
% histOrKS: 'hist' for histogram, or 'ks' for ksdensity
%
% numBins: (*) (for 'hist'): an integer, uses a histogram with that many bins
%          (*) (for 'ks'): a positive real number, the width parameter for
%                       ksdensity (can also be empty for the default width
%                       parameter, optimum for Gaussian)
%
% olremp [opt]: the proportion of outliers to remove at each extreme
%           (e.g., olremp = 0.01 keeps only the middle 98% of data; 0
%           keeps all data. This parameter ought to be less than 0.5,
%           which keeps none of the data).
%           If olremp is specified (nonzero), returns the difference in
%           entropy from removing the outliers.
% ------------------------------------------------------------------------------
% Copyright (C) 2020, Ben D. Fulcher <ben.d.fulcher@gmail.com>,
% <http://www.benfulcher.com>
%
% If you use this code for your research, please cite the following two papers:
%
% (1) B.D. Fulcher and N.S. Jones, "hctsa: A Computational Framework for Automated
% Time-Series Phenotyping Using Massive Feature Extraction, Cell Systems 5: 527 (2017).
% DOI: 10.1016/j.cels.2017.10.001
%
% (2) B.D. Fulcher, M.A. Little, N.S. Jones, "Highly comparative time-series
% analysis: the empirical structure of time series and their methods",
% J. Roy. Soc. Interface 10(83) 20130048 (2013).
% DOI: 10.1098/rsif.2013.0048
%
% This function is free software: you can redistribute it and/or modify it under
% the terms of the GNU General Public License as published by the Free Software
% Foundation, either version 3 of the License, or (at your option) any later
% version.
%
% This program is distributed in the hope that it will be useful, but WITHOUT
% ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
% FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
% details.
%
% You should have received a copy of the GNU General Public License along with
% this program. If not, see <http://www.gnu.org/licenses/>.
% ------------------------------------------------------------------------------

plotResult = 0; % set true to plot the estimated distribution to a figure

% ------------------------------------------------------------------------------
%% Check inputs, set defaults
% ------------------------------------------------------------------------------
if nargin < 2 || isempty(histOrKS)
    % Histogram-based estimation by default:
    histOrKS = 'hist';
end
if nargin < 3
    % Default: 10 bins (can be empty for default ksdensity width):
    numBins = 10;
end
if nargin < 4
    % Default: keep all data:
    olremp = 0;
end

% ------------------------------------------------------------------------------
% (1) Remove outliers?
% ------------------------------------------------------------------------------
if olremp ~= 0
    % Keep only values between the olremp and (1-olremp) quantiles:
    lowerBound = quantile(y,olremp);
    upperBound = quantile(y,1-olremp);
    yTrimmed = y(y >= lowerBound & y <= upperBound);
    if isempty(yTrimmed)
        % Removed the entire time series?! Shouldn't be possible for good
        % values of olremp, given the inclusive inequalities above.
        out = NaN;
        return
    end
    % Return the difference in entropy from removing the outliers:
    out = EN_DistributionEntropy(y,histOrKS,numBins) - ...
                EN_DistributionEntropy(yTrimmed,histOrKS,numBins);
    return
end

% ------------------------------------------------------------------------------
% (2) Estimate the distribution
% ------------------------------------------------------------------------------
if strcmp(histOrKS,'hist')
    % Histogram-based probability estimate:
    if isnumeric(numBins)
        [px,binEdges] = histcounts(y,numBins,'Normalization','probability');
    else
        % numBins names a binning method (e.g., 'auto'):
        [px,binEdges] = histcounts(y,'BinMethod',numBins,'Normalization','probability');
    end
    % Bin centers (midpoints of consecutive edges):
    xr = (binEdges(1:end-1) + binEdges(2:end))/2;
    % Bin widths:
    binWidths = diff(binEdges);
elseif strcmp(histOrKS,'ks')
    % Kernel-smoothed density estimate:
    if isempty(numBins)
        [px,xr] = ksdensity(y,'function','pdf'); % selects optimal width
    else
        [px,xr] = ksdensity(y,'width',numBins,'function','pdf'); % uses specified width
    end
    % ksdensity evaluates on a uniformly-spaced grid, so all widths are equal:
    binWidths = repmat(xr(2)-xr(1),1,length(px));
else
    error('Unknown distribution method -- specify ''ks'' or ''hist''') % error; must specify 'ks' or 'hist'
end

if plotResult
    figure('color','w'); box('on');
    plot(xr,px,'k')
end

% ------------------------------------------------------------------------------
% (3) Compute the entropy sum and return it as output
% ------------------------------------------------------------------------------
% Terms with zero probability contribute nothing (0*log0 = 0):
isPos = (px > 0);
out = -sum(px(isPos).*log(px(isPos)./binWidths(isPos)));

end