7
7
using Flux
8
8
using Statistics
9
9
using CUDA
10
- using DelimitedFiles
10
+
11
# Cap on the `threads=` value passed to `@cuda` when launching the CTC kernels;
# the launch uses `min(U′, MAX_THREADS)` threads in a single block.
+ const MAX_THREADS = 256
11
12
12
13
function log_plus_f (p1, p2)
13
14
@@ -51,24 +52,21 @@ function computeAlphaKernel(probs, labelSize, uttLength, repeats, labelsWithoutB
51
52
# Fill in first column (time step)
52
53
i = tid
53
54
while i <= last - start
54
- alpha[start + i ] = probs[labels[start + i] ]
55
+ alpha[start+ i, 1 ] = probs[labels[start+ i], 1 ]
55
56
i += blockDim (). x
56
57
end
57
58
58
59
sync_threads ()
59
60
60
61
# Fill in coefficients for each time step
61
62
for t= 2 : T
62
- startCurCol = (t- 1 ) * S
63
- startPrevCol = (t- 2 ) * S
64
- startProbCol = (t- 1 ) * div (length (probs), T)
65
63
66
64
# Corner-case checking
67
65
if tid == 1 && ! (1 < S - 2 * (T- t) - 1 )
68
66
if start == 0
69
- alpha[startCurCol + 1 ] = probs[startProbCol + blankLabel] + alpha[startPrevCol + 1 ]
67
+ alpha[1 , t ] = probs[blankLabel, t ] + alpha[1 , t - 1 ]
70
68
elseif start == 1
71
- alpha[startCurCol + 1 ] = alpha[startPrevCol + 1 ]
69
+ alpha[1 , t ] = alpha[1 , t - 1 ]
72
70
end
73
71
end
74
72
@@ -79,16 +77,16 @@ function computeAlphaKernel(probs, labelSize, uttLength, repeats, labelsWithoutB
79
77
idx = tid+ 1
80
78
while idx <= S
81
79
82
- prevSum = log_plus_f (alpha[startPrevCol + idx], alpha[startPrevCol + idx- 1 ])
80
+ prevSum = log_plus_f (alpha[idx, t - 1 ], alpha[idx- 1 , t - 1 ])
83
81
84
82
if labels[idx] != blankLabel && idx != 2 && labels[idx] != labels[idx- 2 ]
85
- prevSum = log_plus_f (prevSum, alpha[startPrevCol + idx- 2 ])
83
+ prevSum = log_plus_f (prevSum, alpha[idx- 2 , t - 1 ])
86
84
end
87
85
88
86
if idx < S - 2 * (T- t) - 1
89
- alpha[idx + startCurCol ] = - Inf32
87
+ alpha[idx, t ] = - Inf32
90
88
else
91
- alpha[startCurCol + idx] = prevSum + probs[startProbCol + labels[idx]]
89
+ alpha[idx, t ] = prevSum + probs[labels[idx], t ]
92
90
end
93
91
94
92
idx += blockDim (). x
@@ -122,52 +120,40 @@ function computeBetasAndGradKernel(probs, labelSize, uttLength,
122
120
123
121
sync_threads ()
124
122
125
-
126
- startCurCol = (T- 1 )* S
127
- startProbCol = (T- 1 ) * div (length (probs), T)
128
-
129
123
i = tid
130
124
131
125
# Calculate coefficients for last column (time step)
132
126
# then determine alpha and beta product
133
127
while i <= last - start + 1
134
-
135
- beta[startCurCol + i + start] = 0
136
- output[startCurCol + i + start] = beta[startCurCol + i + start] + alphas[startCurCol + i + start]
128
+ beta[i+ start, T] = 0
129
+ output[i+ start, T] = beta[i+ start, T] + alphas[i+ start, T]
137
130
i += blockDim (). x
138
131
end
139
132
140
133
sync_threads ()
141
134
142
135
# Fill in `accum` for last column (time step)
143
- if tid == 1
144
- startAccCol = startProbCol
145
- startOutputCol = startCurCol
146
-
136
+ if tid == 1
147
137
for i= 1 : S
148
138
labelIdx = labels[i]
149
- accum[startAccCol + labelIdx] = log_plus_f (accum[startAccCol + labelIdx], output[startOutputCol + i ])
139
+ accum[labelIdx, T ] = log_plus_f (accum[labelIdx, T ], output[i, T ])
150
140
end
151
141
end
152
142
153
143
sync_threads ()
154
144
155
145
# Fill in `grad` for last column (time step)
156
146
idx = tid
157
- # while idx <= CUDA.div_fast(Float32(length(grad)), Float32(T))
158
147
while idx <= size (grad, 1 )
159
- #
160
- startProbCol = (T - 1 ) * div (length (probs), T)
161
- startOutputCol = (T - 1 ) * S
162
148
163
149
s = - Inf32
150
+
164
151
for i= 1 : S
165
- s = log_plus_f (s, output[startOutputCol + i ])
152
+ s = log_plus_f (s, output[i, T ])
166
153
end
167
154
168
155
# ∂L/∂a (where a is activation before logsoftmax)
169
- # grad[startProbCol + idx] = CUDA.exp(probs[startProbCol + idx]) - CUDA.exp(accum[startProbCol + idx] - s)
170
- grad[idx, T] = CUDA. exp (probs[startProbCol + idx]) - CUDA. exp (accum[startProbCol + idx] - s)
156
+ grad[idx, T] = CUDA. exp (probs[idx, T]) - CUDA. exp (accum[idx, T] - s)
171
157
idx += blockDim (). x
172
158
end
173
159
@@ -176,28 +162,23 @@ function computeBetasAndGradKernel(probs, labelSize, uttLength,
176
162
# Fill in the rest of the coefficients
177
163
t = T- 1
178
164
while t >= 1
179
-
180
- startCurCol = (t- 1 )* S
181
- startNextCol = t* S
182
- startProbCol = t * div (length (probs), T)
183
-
184
165
if t < T
185
166
186
167
idx = tid
187
168
while idx <= S- 1
188
169
189
- nextSum = log_plus_f (beta[startNextCol + idx ] + probs[startProbCol + labels[idx]],
190
- beta[startNextCol + idx + 1 ] + probs[startProbCol + labels[idx+ 1 ]])
170
+ nextSum = log_plus_f (beta[idx, t + 1 ] + probs[labels[idx], t + 1 ],
171
+ beta[idx + 1 , t + 1 ] + probs[labels[idx+ 1 ], t + 1 ])
191
172
192
173
if labels[idx] != blankLabel && idx != S- 1 && labels[idx] != labels[idx+ 2 ]
193
174
nextSum = log_plus_f (nextSum,
194
- beta[startNextCol + idx + 2 ] + probs[startProbCol + labels[idx+ 2 ]])
175
+ beta[idx + 2 , t + 1 ] + probs[labels[idx+ 2 ], t + 1 ])
195
176
end
196
177
197
178
if idx > 2 * t
198
- beta[idx + startCurCol ] = - Inf32
179
+ beta[idx, t ] = - Inf32
199
180
else
200
- beta[idx + startCurCol ] = nextSum
181
+ beta[idx, t ] = nextSum
201
182
202
183
end
203
184
@@ -207,14 +188,14 @@ function computeBetasAndGradKernel(probs, labelSize, uttLength,
207
188
sync_threads ()
208
189
209
190
if tid == 1 && last == S
    # Row S is not written by the loop above (it stops at S-1), so its value
    # for time step t has to be derived from the next column, t + 1.
    # Reading beta[S, t] here would only re-use the array's fill value; the
    # flat-index version of this kernel read the next column as well.
    beta[S, t] = beta[S, t + 1] + probs[blankLabel, t + 1]
end
212
193
213
194
sync_threads ()
214
195
215
196
idx = tid
216
197
while idx <= S
217
- output[startCurCol + idx] = alphas[idx+ startCurCol ] + beta[startCurCol + idx]
198
+ output[idx, t ] = alphas[idx, t ] + beta[idx, t ]
218
199
idx += blockDim (). x
219
200
end
220
201
@@ -226,14 +207,10 @@ function computeBetasAndGradKernel(probs, labelSize, uttLength,
226
207
227
208
# Calculate accumulated alpha-beta products for each label class for
228
209
# each time step; used in calculating gradients
229
- if tid == 1
230
-
231
- startAccCol = (t- 1 ) * div (length (accum), T)
232
- startOutputCol = (t- 1 ) * S
233
-
210
+ if tid == 1
234
211
for i= 1 : S
235
212
labelIdx = labels[i]
236
- accum[startAccCol + labelIdx] = log_plus_f (accum[startAccCol + labelIdx], output[startOutputCol + i ])
213
+ accum[labelIdx, t ] = log_plus_f (accum[labelIdx, t ], output[i, t ])
237
214
end
238
215
end
239
216
@@ -243,17 +220,15 @@ function computeBetasAndGradKernel(probs, labelSize, uttLength,
243
220
244
221
# Calculate gradients
245
222
while idx <= size (grad, 1 )
246
- #
247
- startProbCol = (t - 1 ) * div (length (probs), T)
248
- startOutputCol = (t - 1 ) * S
249
223
250
224
s = - Inf32
225
+
251
226
for i= 1 : S
252
- s = log_plus_f (s, output[startOutputCol + i ])
227
+ s = log_plus_f (s, output[i, t ])
253
228
end
254
229
255
230
# ∂L/∂a (where a is activation before logsoftmax)
256
- grad[idx, t] = CUDA. exp (probs[startProbCol + idx]) - CUDA. exp (accum[startProbCol + idx] - s)
231
+ grad[idx, t] = CUDA. exp (probs[idx, t ]) - CUDA. exp (accum[idx, t ] - s)
257
232
idx += blockDim (). x
258
233
end
259
234
@@ -266,20 +241,15 @@ function computeBetasAndGradKernel(probs, labelSize, uttLength,
266
241
return nothing
267
242
end
268
243
244
+ # methods for `ctc_` helper function
269
245
# GPU entry points for the CTC loss. Each method forwards to `ctc_`, takes the
# first element of what it returns, and reduces that with `mean`.

# `ŷ` already on the device, `y` a host array: forward both unchanged.
function ctc(ŷ::CuArray, y::Array)
    return mean(ctc_(ŷ, y)[1])
end

# `ŷ` on the host, `y` on the device: wrap `ŷ` in a `CuArray` and pass `y`
# through `collect` before forwarding.
function ctc(ŷ::Array, y::CuArray)
    return mean(ctc_(CuArray(ŷ), collect(y))[1])
end

# Both on the device: only `y` is passed through `collect`.
function ctc(ŷ::CuArray, y::CuArray)
    return mean(ctc_(ŷ, collect(y))[1])
end
274
-
275
- # methods for `ctc_` helper function
276
- ctc_ (ŷ:: Array , y:: CuArray ) = ctc_ (CuArray (ŷ), y)
248
# Host-array predictions with device-array targets: wrap `ŷ` in a `CuArray`,
# pass `y` through `collect`, and forward to the `ctc_(::CuArray, y)` method.
function ctc_(ŷ::Array, y::CuArray)
    return ctc_(CuArray(ŷ), collect(y))
end
277
249
278
250
function ctc_ (ŷ:: CuArray , y)
279
251
280
252
ŷ = logsoftmax (ŷ)
281
- if any (isinf .(ŷ)) error (" Inf in yhat" ) end
282
- if any (isnan .(ŷ)) error (" NaN in yhat" ) end
283
253
284
254
blank = size (ŷ, 1 )
285
255
labels = [Base. argmax (y[:,i]) for i in 1 : size (y, 2 )]
@@ -289,23 +259,25 @@ function ctc_(ŷ::CuArray, y)
289
259
push! (z′, label)
290
260
push! (z′, blank)
291
261
end
262
+
292
263
T = size (ŷ, 2 )
293
264
U′ = 2 * length (z) + 1
294
- alphas = CUDA. fill (log (zero (ŷ[1 ])), T * U′)
295
- betas = copy (alphas)
296
- output = copy (alphas)
265
+
266
+ alphas = CUDA. fill (log (zero (ŷ[1 ])), U′, T)
267
+ betas = CUDA. fill (log (zero (ŷ[1 ])), U′, T)
268
+ output = CUDA. fill (log (zero (ŷ[1 ])), U′, T)
297
269
298
270
nRepeats = countRepeats (labels)
271
+ nThreads = min (U′, MAX_THREADS)
299
272
300
- # 1 block with `U′` threads
301
- @cuda blocks= 1 threads= U′ computeAlphaKernel (ŷ, length (z), size (ŷ,2 ), nRepeats, CuArray (z), CuArray (z′), alphas, blank)
273
+ @cuda blocks= 1 threads= nThreads computeAlphaKernel (ŷ, length (z), size (ŷ,2 ), nRepeats, CuArray (z), CuArray (z′), alphas, blank)
302
274
303
275
grads = CUDA. fill (log (zero (ŷ[1 ])), size (ŷ))
304
- accum = CUDA. fill (log (zero (ŷ[1 ])), length (ŷ))
276
+ accum = CUDA. fill (log (zero (ŷ[1 ])), size (ŷ))
305
277
306
- @cuda blocks= 1 threads= U′ computeBetasAndGradKernel (ŷ, length (z), size (ŷ,2 ), nRepeats, CuArray (z′), alphas, betas, output, accum, grads, blank)
278
+ @cuda blocks= 1 threads= nThreads computeBetasAndGradKernel (ŷ, length (z), size (ŷ,2 ), nRepeats, CuArray (z′), alphas, betas, output, accum, grads, blank)
307
279
308
- ls = reshape ( collect (output), U′, T )
280
+ ls = collect (output)
309
281
ls = vec (- 1 .* [logsum (ls[:,i]) for i in 1 : size (ls, 2 )])
310
282
311
283
ŷ = alphas = betas = output = accum = nothing
0 commit comments