function Y = vl_nnloss_future(X,c,dzdy,varargin)
% Copied verbatim from a later version of vl_nnloss in the MatConvNet
% (VLFeat) library.
%VL_NNLOSS CNN categorical or attribute loss.
% Y = VL_NNLOSS(X, C) computes the loss incurred by the prediction
% scores X given the categorical labels C.
%
% The prediction scores X are organised as a field of prediction
% vectors, represented by a H x W x D x N array. The first two
% dimensions, H and W, are spatial and correspond to the height and
% width of the field; the third dimension D is the number of
% categories or classes; finally, the dimension N is the number of
% data items (images) packed in the array.
%
% While often one has H = W = 1, the case W, H > 1 is useful in
% dense labelling problems such as image segmentation. In the latter
% case, the loss is summed across pixels (contributions can be
% weighed using the `InstanceWeights` option described below).
%
% The array C contains the categorical labels. In the simplest case,
% C is an array of integers in the range [1, D] with N elements
% specifying one label for each of the N images. If H, W > 1, the
% same label is implicitly applied to all spatial locations.
%
% In the second form, C has dimension H x W x 1 x N and specifies a
% categorical label for each spatial location.
%
% In the third form, C has dimension H x W x D x N and specifies
% attributes rather than categories. Here the elements of C are
% either +1 or -1, where +1 denotes that an attribute is present
% and -1 that it is not. The key difference is that multiple
% attributes can be active at the same time, while categories are
% mutually exclusive. By default, the loss is *summed* across
% attributes (unless otherwise specified using the
% `InstanceWeights` option described below).
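%
% As a quick sketch (values here are illustrative, not part of the
% original help), for D = 3 classes, N = 2 images and a 4 x 4
% prediction field the three label forms could be built as:
%
%   c1 = [2 3] ;                    % form 1: one label per image
%   c2 = randi(3, [4 4 1 2]) ;      % form 2: one label per location
%   c3 = sign(randn(4, 4, 3, 2)) ;  % form 3: +/-1 attribute flags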
%
% DZDX = VL_NNLOSS(X, C, DZDY) computes the derivative of the block
% projected onto the output derivative DZDY. DZDX and DZDY have the
% same dimensions as X and Y respectively.
%
% VL_NNLOSS() supports several loss functions, which can be selected
% by using the option `loss` described below. When each scalar c in
% C is interpreted as a categorical label (first two forms above),
% the following losses can be used:
%
% Classification error:: `classerror`
% L(X,c) = (argmax_q X(q) ~= c). Note that the classification
% error derivative is flat; therefore this loss is useful for
% assessment, but not for training a model.
%
% Top-K classification error:: `topkerror`
% L(X,c) = (rank of X(c) in X > K), i.e. an error is counted when
% the ground-truth class is not among the K classes with the
% highest scores. For K=1, this is the same as the classification
% error. K is controlled by the `topK` option.
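%
% For instance (an illustrative case, not from the original help),
% with X = [1 2 3] and c = 2, the score X(2) has rank 2, so no error
% is counted for K = 2 but an error is counted for K = 1.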
%
% Log loss:: `log`
% L(X,c) = - log(X(c)). This function assumes that X(c) is the
% predicted probability of class c (hence the vector X must be non
% negative and sum to one).
%
% Softmax log loss (multinomial logistic loss):: `softmaxlog`
% L(X,c) = - log(P(c)) where P(c) = exp(X(c)) / sum_q exp(X(q)).
% This is the same as the `log` loss, but renormalizes the
% predictions using the softmax function.
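%
% As a worked example (numbers chosen here for illustration), for a
% single prediction X = [1 2 3] with c = 3 the softmax log loss is
% log(exp(1) + exp(2) + exp(3)) - 3, which is approximately 0.4076.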
%
% Multiclass hinge loss:: `mhinge`
% L(X,c) = max{0, 1 - X(c)}. This function assumes that X(c) is
% the score margin for class c against the other classes. See
% also the `mmhinge` loss below.
%
% Multiclass structured hinge loss:: `mshinge`
% L(X,c) = max{0, 1 - M(c)} where M(c) = X(c) - max_{q ~= c}
% X(q). This is the same as the `mhinge` loss, but computes the
% margin between the prediction scores first. This is also known
% as the Crammer-Singer loss, an example of a structured
% prediction loss.
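%
% For instance (illustrative), with X = [1 2 3] and c = 3 the
% `mshinge` margin is M(3) = 3 - 2 = 1, so the loss max{0, 1 - M(3)}
% vanishes; for c = 2 the margin is 2 - 3 = -1 and the loss is 2.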
%
% When C is a vector of binary attributes c in {+1,-1}, each scalar
% prediction score x is interpreted as voting for the presence or
% absence of a particular attribute. The following losses can be
% used:
%
% Binary classification error:: `binaryerror`
% L(x,c) = (sign(x - t) ~= c). Here t is a threshold that can be
% specified using the `threshold` option and defaults to zero. If
% x is a probability, the threshold should be set to 0.5.
%
% Binary log loss:: `binarylog`
% L(x,c) = - log(c(x-0.5) + 0.5). x is assumed to be the
% probability that the attribute is active (c=+1). Hence x must be
% a number in the range [0,1]. This is the binary version of the
% `log` loss.
%
% Logistic log loss:: `logistic`
% L(x,c) = log(1 + exp(- cx)). This is the same as the `binarylog`
% loss, but implicitly normalizes the score x into a probability
% using the logistic (sigmoid) function: p = sigmoid(x) = 1 / (1 +
% exp(-x)). This is also equivalent to `softmaxlog` loss where
% class c=+1 is assigned score x and class c=-1 is assigned score
% 0.
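%
% As an illustrative check, for x = 2 and c = +1 the logistic loss is
% log(1 + exp(-2)), approximately 0.1269, which indeed matches the
% `softmaxlog` loss on the score pair [2 0] with label 1.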
%
% Hinge loss:: `hinge`
% L(x,c) = max{0, 1 - cx}. This is the standard hinge loss for
% binary classification. This is equivalent to the `mshinge` loss
% if class c=+1 is assigned score x and class c=-1 is assigned
% score 0.
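%
% A minimal usage sketch (shapes and values are illustrative; an
% empty third argument selects the forward pass, as in the code
% below):
%
%   x = randn(1, 1, 10, 4, 'single') ;  % D = 10 classes, N = 4 images
%   c = [3 1 7 10] ;                    % one label per image (form 1)
%   y = vl_nnloss_future(x, c, [], 'loss', 'softmaxlog') ;
%   dzdx = vl_nnloss_future(x, c, single(1), 'loss', 'softmaxlog') ;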
%
% VL_NNLOSS(...,'OPT', VALUE, ...) supports these additional
% options:
%
% Loss:: `softmaxlog`
% The loss type, one of the loss functions listed above.
%
% InstanceWeights:: []
% Weights the loss as L'(x,c) = WGT L(x,c), where WGT is a
% per-instance weight extracted from the array `InstanceWeights`.
% For categorical losses, this is either a H x W x 1 or a
% H x W x 1 x N array. For attribute losses, this is either a
% H x W x D or a H x W x D x N array.
%
% TopK:: 5
% Top-K value for the top-K error. Note that K should not
% exceed the number of labels.
%
% Threshold:: 0
% The threshold used by the `binaryerror` loss.
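%
% For example (a sketch reusing x and c from the usage sketch above;
% the weight shape matches the categorical case with H = W = 1 and
% N = 4):
%
%   w = ones(1, 1, 1, 4, 'single') / 4 ;  % average over the batch
%   y = vl_nnloss_future(x, c, [], 'instanceWeights', w) ;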
%
% See also: VL_NNSOFTMAX().

% Copyright (C) 2014-15 Andrea Vedaldi.
% All rights reserved.
%
% This file is part of the VLFeat library and is made available under
% the terms of the BSD license (see the COPYING file).

opts.instanceWeights = [] ;
opts.classWeights = [] ;  % parsed but not used in this version
opts.threshold = 0 ;
opts.loss = 'softmaxlog' ;
opts.topK = 5 ;
opts = vl_argparse(opts, varargin) ;

inputSize = [size(X,1) size(X,2) size(X,3) size(X,4)] ;

% Form 1: C has one label per image. In this case, get C in form 2 or
% form 3.
c = gather(c) ;
if numel(c) == inputSize(4)
  c = reshape(c, [1 1 1 inputSize(4)]) ;
  c = repmat(c, inputSize(1:2)) ;
end

if isa(X,'gpuArray')
  dataType = classUnderlying(X) ;
else
  dataType = class(X) ;
end
switch dataType
  case 'double', toClass = @(x) double(x) ;
  case 'single', toClass = @(x) single(x) ;
end

% --------------------------------------------------------------------
% Spatial weighting
% --------------------------------------------------------------------

labelSize = [size(c,1) size(c,2) size(c,3) size(c,4)] ;
assert(isequal(labelSize(1:2), inputSize(1:2))) ;
assert(labelSize(4) == inputSize(4)) ;

switch lower(opts.loss)
  case {'classerror', 'topkerror', 'log', 'softmaxlog', 'mhinge', 'mshinge'}
    binary = false ;
    % there must be one categorical label per prediction vector
    assert(labelSize(3) == 1) ;
    % null labels denote instances that should be skipped
    instanceWeights = toClass(c(:,:,1,:) ~= 0) ;
  case {'binaryerror', 'binarylog', 'logistic', 'hinge'}
    binary = true ;
    % there must be one categorical label per prediction scalar
    assert(labelSize(3) == inputSize(3)) ;
    % null labels denote instances that should be skipped
    instanceWeights = toClass(c ~= 0) ;
  otherwise
    error('Unknown loss ''%s''.', opts.loss) ;
end

if ~isempty(opts.instanceWeights)
  instanceWeights = bsxfun(@times, instanceWeights, opts.instanceWeights) ;
end

% --------------------------------------------------------------------
% Do the work
% --------------------------------------------------------------------

switch lower(opts.loss)
  case {'log', 'softmaxlog', 'mhinge', 'mshinge'}
    % from category labels to indexes
    numPixelsPerImage = prod(inputSize(1:2)) ;
    numPixels = numPixelsPerImage * inputSize(4) ;
    imageVolume = numPixelsPerImage * inputSize(3) ;
    n = reshape(0:numPixels-1,labelSize) ;
    offset = 1 + mod(n, numPixelsPerImage) + ...
             imageVolume * fix(n / numPixelsPerImage) ;
    ci = offset + numPixelsPerImage * max(c - 1,0) ;
end
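
% At this point (categorical losses only), ci(h,w,1,n) is the linear
% index into X of the score X(h,w,c(h,w,1,n),n) of the ground-truth
% class, so X(ci) gathers all ground-truth scores in one vectorised
% operation. Null labels (c == 0) are clamped to class 1 here and
% zeroed out later by instanceWeights.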

if nargin <= 2 || isempty(dzdy)
  % forward pass: compute the (weighted) sum of per-instance losses
  switch lower(opts.loss)
    case 'classerror'
      [~,chat] = max(X,[],3) ;
      t = toClass(c ~= chat) ;
    case 'topkerror'
      [~,predictions] = sort(X,3,'descend') ;
      t = 1 - sum(bsxfun(@eq, c, predictions(:,:,1:opts.topK,:)), 3) ;
    case 'log'
      t = - log(X(ci)) ;
    case 'softmaxlog'
      Xmax = max(X,[],3) ;
      ex = exp(bsxfun(@minus, X, Xmax)) ;
      t = Xmax + log(sum(ex,3)) - X(ci) ;
    case 'mhinge'
      t = max(0, 1 - X(ci)) ;
    case 'mshinge'
      Q = X ;
      Q(ci) = -inf ;
      t = max(0, 1 - X(ci) + max(Q,[],3)) ;
    case 'binaryerror'
      t = toClass(sign(X - opts.threshold) ~= c) ;
    case 'binarylog'
      t = -log(c.*(X-0.5) + 0.5) ;
    case 'logistic'
      % t = log(1 + exp(-c.*X)), evaluated in a numerically stable way
      a = -c.*X ;
      b = max(0, a) ;
      t = b + log(exp(-b) + exp(a-b)) ;
    case 'hinge'
      t = max(0, 1 - c.*X) ;
  end
  Y = instanceWeights(:)' * t(:) ;
else
  % backward pass: back-propagate DZDY through the selected loss
  dzdy = dzdy * instanceWeights ;
  switch lower(opts.loss)
    case {'classerror', 'topkerror'}
      Y = zerosLike(X) ;
    case 'log'
      Y = zerosLike(X) ;
      Y(ci) = - dzdy ./ max(X(ci), 1e-8) ;
    case 'softmaxlog'
      % derivative of the softmax log loss: the softmax probabilities
      % minus the one-hot indicator of the ground-truth class
      Xmax = max(X,[],3) ;
      ex = exp(bsxfun(@minus, X, Xmax)) ;
      Y = bsxfun(@rdivide, ex, sum(ex,3)) ;
      Y(ci) = Y(ci) - 1 ;
      Y = bsxfun(@times, dzdy, Y) ;
    case 'mhinge'
      Y = zerosLike(X) ;
      Y(ci) = - dzdy .* (X(ci) < 1) ;
    case 'mshinge'
      Q = X ;
      Q(ci) = -inf ;
      [~, q] = max(Q,[],3) ;
      qi = offset + numPixelsPerImage * (q - 1) ;
      W = dzdy .* (X(ci) - X(qi) < 1) ;
      Y = zerosLike(X) ;
      Y(ci) = - W ;
      Y(qi) = + W ;
    case 'binaryerror'
      Y = zerosLike(X) ;
    case 'binarylog'
      Y = - dzdy ./ (X + (c-1)*0.5) ;
    case 'logistic'
      % d/dx log(1 + exp(-c.*x)) = -c ./ (1 + exp(c.*x))
      Y = - dzdy .* c ./ (1 + exp(c.*X)) ;
    case 'hinge'
      Y = - dzdy .* c .* (c.*X < 1) ;
  end
end

% --------------------------------------------------------------------
function y = zerosLike(x)
% --------------------------------------------------------------------
if isa(x,'gpuArray')
  y = gpuArray.zeros(size(x),classUnderlying(x)) ;
else
  y = zeros(size(x),'like',x) ;
end