Commit fa6ea80

Change alphas to multi-dim indexing
Change probs to multi-dim indexing in alpha kernel
Change probs to multi-dim indexing in beta kernel
Change beta coefficients to multi-dim indexing
Change output to multi-dim indexing
Update accum to multi-dim indexing
Update gpu kernel to multi-dim indexing
1 parent b9072d2 commit fa6ea80

File tree

1 file changed: +40 −68 lines changed
src/losses/ctc-gpu.jl

Lines changed: 40 additions & 68 deletions
@@ -7,7 +7,8 @@
 using Flux
 using Statistics
 using CUDA
-using DelimitedFiles
+
+const MAX_THREADS = 256
 
 function log_plus_f(p1, p2)
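
The `const MAX_THREADS = 256` introduced here caps the kernel launch size; the kernels then cover any remaining work with stride loops (see the note after the last hunk). For context, `log_plus_f(p1, p2)` computes log(exp(p1) + exp(p2)) without leaving the log domain. Its body is outside this diff, so the following is only a sketch of the usual numerically stable formulation, not necessarily the file's exact code:

    # Sketch (assumed, not the file's exact body): stable log-sum-exp of
    # two log-domain values, tolerant of -Inf32 inputs.
    function log_plus_f(p1, p2)
        isinf(p1) && return p2      # exp(-Inf32) == 0, so the sum is exp(p2)
        isinf(p2) && return p1
        p1 < p2 && ((p1, p2) = (p2, p1))   # factor out the larger term
        return p1 + log(1 + exp(p2 - p1))
    end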

@@ -51,24 +52,21 @@ function computeAlphaKernel(probs, labelSize, uttLength, repeats, labelsWithoutB
     # Fill in first column (time step)
     i = tid
     while i <= last - start
-        alpha[start + i] = probs[labels[start + i]]
+        alpha[start+i, 1] = probs[labels[start+i], 1]
         i += blockDim().x
     end
 
     sync_threads()
 
     # Fill in coefficients for each time step
     for t=2:T
-        startCurCol = (t-1) * S
-        startPrevCol = (t-2) * S
-        startProbCol = (t-1) * div(length(probs), T)
 
         # Corner-case checking
         if tid == 1 && !(1 < S - 2*(T-t) - 1)
             if start == 0
-                alpha[startCurCol + 1] = probs[startProbCol + blankLabel] + alpha[startPrevCol + 1]
+                alpha[1, t] = probs[blankLabel, t] + alpha[1, t-1]
             elseif start == 1
-                alpha[startCurCol + 1] = alpha[startPrevCol + 1]
+                alpha[1, t] = alpha[1, t-1]
             end
         end
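
Every change in this hunk is the same mechanical rewrite: Julia arrays are column-major, so the flat offset the old code computed by hand, (t-1)*S + idx, addresses exactly the element alpha[idx, t] once alpha is allocated as a U′×T matrix. A quick CPU check of that identity (hypothetical names, not from the file):

    # Sketch: hand-computed column offsets and two-dimensional indexing
    # hit the same storage on a column-major matrix.
    S, T = 5, 3
    A = reshape(collect(1:S*T), S, T)
    for t in 1:T, idx in 1:S
        startCol = (t - 1) * S          # what the old kernel computed
        @assert A[startCol + idx] == A[idx, t]
    end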

@@ -79,16 +77,16 @@ function computeAlphaKernel(probs, labelSize, uttLength, repeats, labelsWithoutB
         idx = tid+1
         while idx <= S
 
-            prevSum = log_plus_f(alpha[startPrevCol + idx], alpha[startPrevCol + idx-1])
+            prevSum = log_plus_f(alpha[idx, t-1], alpha[idx-1, t-1])
 
             if labels[idx] != blankLabel && idx != 2 && labels[idx] != labels[idx-2]
-                prevSum = log_plus_f(prevSum, alpha[startPrevCol + idx-2])
+                prevSum = log_plus_f(prevSum, alpha[idx-2, t-1])
             end
 
             if idx < S - 2*(T-t) - 1
-                alpha[idx + startCurCol] = -Inf32
+                alpha[idx, t] = -Inf32
             else
-                alpha[startCurCol + idx] = prevSum + probs[startProbCol + labels[idx]]
+                alpha[idx, t] = prevSum + probs[labels[idx], t]
             end
 
             idx += blockDim().x
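
For readers following the indexing change, this is the CTC forward recursion: each alpha[idx, t] combines up to three predecessors from column t-1 and adds the current frame's log-probability. A hedged CPU paraphrase (hypothetical helper, not part of the file; it omits the kernel's -Inf32 pruning of unreachable states and its thread striding):

    # Sketch: serial CTC forward pass in log space over an S×T matrix.
    # `labels` is the blank-interleaved label sequence of length S.
    function alpha_cpu(probs, labels, blankLabel)
        S, T = length(labels), size(probs, 2)
        alpha = fill(-Inf32, S, T)
        alpha[1, 1] = probs[labels[1], 1]
        S > 1 && (alpha[2, 1] = probs[labels[2], 1])
        for t in 2:T, s in 1:S
            prevSum = alpha[s, t-1]
            s > 1 && (prevSum = log_plus_f(prevSum, alpha[s-1, t-1]))
            if s > 2 && labels[s] != blankLabel && labels[s] != labels[s-2]
                prevSum = log_plus_f(prevSum, alpha[s-2, t-1])
            end
            alpha[s, t] = prevSum + probs[labels[s], t]
        end
        return alpha
    end
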
@@ -122,52 +120,40 @@ function computeBetasAndGradKernel(probs, labelSize, uttLength,
 
     sync_threads()
 
-
-    startCurCol = (T-1)*S
-    startProbCol = (T-1) * div(length(probs), T)
-
     i = tid
 
     # Calculate coefficients for last column (time step)
     # then determine alpha and beta product
     while i <= last - start + 1
-
-        beta[startCurCol + i + start] = 0
-        output[startCurCol + i + start] = beta[startCurCol + i + start] + alphas[startCurCol + i + start]
+        beta[i+start, T] = 0
+        output[i+start, T] = beta[i+start, T] + alphas[i+start, T]
         i += blockDim().x
     end
 
     sync_threads()
 
     # Fill in `accum` for last column (time step)
-    if tid == 1
-        startAccCol = startProbCol
-        startOutputCol = startCurCol
-
+    if tid == 1
         for i=1:S
             labelIdx = labels[i]
-            accum[startAccCol + labelIdx] = log_plus_f(accum[startAccCol + labelIdx], output[startOutputCol + i])
+            accum[labelIdx, T] = log_plus_f(accum[labelIdx, T], output[i, T])
         end
     end
 
     sync_threads()
 
     # Fill in `grad` for last column (time step)
     idx = tid
-    # while idx <= CUDA.div_fast(Float32(length(grad)), Float32(T))
     while idx <= size(grad, 1)
-        #
-        startProbCol = (T - 1) * div(length(probs), T)
-        startOutputCol = (T - 1) * S
 
         s = -Inf32
+
         for i=1:S
-            s = log_plus_f(s, output[startOutputCol + i])
+            s = log_plus_f(s, output[i, T])
         end
 
         # ∂L/∂a (where a is activation before logsoftmax)
-        # grad[startProbCol + idx] = CUDA.exp(probs[startProbCol + idx]) - CUDA.exp(accum[startProbCol + idx] - s)
-        grad[idx, T] = CUDA.exp(probs[startProbCol + idx]) - CUDA.exp(accum[startProbCol + idx] - s)
+        grad[idx, T] = CUDA.exp(probs[idx, T]) - CUDA.exp(accum[idx, T] - s)
         idx += blockDim().x
     end
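
The gradient line encodes the standard CTC result for activations feeding a logsoftmax: ∂L/∂a[k,t] = softmax(a)[k,t] minus the normalized alignment mass on class k at frame t, with everything held in the log domain until the final exp. A hedged standalone version of one column (hypothetical helper, not from the file):

    # Sketch: per-frame CTC gradient from log-domain quantities.
    # probs_t = column of log-softmax outputs, accum_t = log alpha*beta
    # mass per class, output_t = log alpha*beta mass per label position.
    function grad_column(probs_t, accum_t, output_t)
        s = reduce(log_plus_f, output_t; init = -Inf32)  # log total mass
        return exp.(probs_t) .- exp.(accum_t .- s)
    end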

@@ -176,28 +162,23 @@ function computeBetasAndGradKernel(probs, labelSize, uttLength,
     # Fill in the rest of the coefficients
     t = T-1
     while t >= 1
-
-        startCurCol = (t-1)*S
-        startNextCol = t*S
-        startProbCol = t * div(length(probs), T)
-
         if t < T
 
             idx = tid
             while idx <= S-1
 
-                nextSum = log_plus_f(beta[startNextCol + idx] + probs[startProbCol + labels[idx]],
-                    beta[startNextCol + idx+1] + probs[startProbCol + labels[idx+1]])
+                nextSum = log_plus_f(beta[idx, t+1] + probs[labels[idx], t+1],
+                    beta[idx+1, t+1] + probs[labels[idx+1], t+1])
 
                 if labels[idx] != blankLabel && idx != S-1 && labels[idx] != labels[idx+2]
                     nextSum = log_plus_f(nextSum,
-                        beta[startNextCol + idx + 2] + probs[startProbCol + labels[idx+2]])
+                        beta[idx + 2, t+1] + probs[labels[idx+2], t+1])
                 end
 
                 if idx > 2*t
-                    beta[idx + startCurCol] = -Inf32
+                    beta[idx, t] = -Inf32
                 else
-                    beta[idx + startCurCol] = nextSum
+                    beta[idx, t] = nextSum
 
                 end
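
The backward recursion mirrors the forward one, reading column t+1 instead of t-1 and folding the successor states' emission log-probabilities into the log-sum. A hedged CPU paraphrase of one column update (hypothetical helper; the kernel additionally prunes entries with idx > 2t to -Inf32 and strides over threads):

    # Sketch: one backward step of the CTC recursion in log space.
    function beta_step!(beta, probs, labels, blankLabel, t)
        S = length(labels)
        for s in 1:S-1
            nextSum = log_plus_f(beta[s, t+1] + probs[labels[s], t+1],
                                 beta[s+1, t+1] + probs[labels[s+1], t+1])
            if s < S - 1 && labels[s] != blankLabel && labels[s] != labels[s+2]
                nextSum = log_plus_f(nextSum, beta[s+2, t+1] + probs[labels[s+2], t+1])
            end
            beta[s, t] = nextSum
        end
    end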

@@ -207,14 +188,14 @@ function computeBetasAndGradKernel(probs, labelSize, uttLength,
             sync_threads()
 
             if tid == 1 && last == S
-                beta[startCurCol + S] = beta[startNextCol + S] + probs[startProbCol + blankLabel]
+                beta[S, t] = beta[S, t+1] + probs[blankLabel, t+1]
             end
 
             sync_threads()
 
             idx = tid
             while idx <= S
-                output[startCurCol + idx] = alphas[idx+startCurCol] + beta[startCurCol + idx]
+                output[idx, t] = alphas[idx, t] + beta[idx, t]
                 idx += blockDim().x
             end

@@ -226,14 +207,10 @@ function computeBetasAndGradKernel(probs, labelSize, uttLength,
 
         # Calculate accumulated alpha-beta products for each label class for
         # each time step; used in calculating gradients
-        if tid == 1
-
-            startAccCol = (t-1) * div(length(accum), T)
-            startOutputCol = (t-1) * S
-
+        if tid == 1
             for i=1:S
                 labelIdx = labels[i]
-                accum[startAccCol + labelIdx] = log_plus_f(accum[startAccCol + labelIdx], output[startOutputCol + i])
+                accum[labelIdx, t] = log_plus_f(accum[labelIdx, t], output[i, t])
             end
         end

@@ -243,17 +220,15 @@ function computeBetasAndGradKernel(probs, labelSize, uttLength,
 
         # Calculate gradients
         while idx <= size(grad, 1)
-            #
-            startProbCol = (t - 1) * div(length(probs), T)
-            startOutputCol = (t - 1) * S
 
             s = -Inf32
+
             for i=1:S
-                s = log_plus_f(s, output[startOutputCol + i])
+                s = log_plus_f(s, output[i, t])
             end
 
             # ∂L/∂a (where a is activation before logsoftmax)
-            grad[idx, t] = CUDA.exp(probs[startProbCol + idx]) - CUDA.exp(accum[startProbCol + idx] - s)
+            grad[idx, t] = CUDA.exp(probs[idx, t]) - CUDA.exp(accum[idx, t] - s)
             idx += blockDim().x
         end

@@ -266,20 +241,15 @@ function computeBetasAndGradKernel(probs, labelSize, uttLength,
     return nothing
 end
 
+# methods for `ctc_` helper function
 ctc(ŷ::CuArray, y::Array) = ctc_(ŷ, y)[1] |> mean
-
 ctc(ŷ::Array, y::CuArray) = ctc_(CuArray(ŷ), collect(y))[1] |> mean
-
 ctc(ŷ::CuArray, y::CuArray) = ctc_(ŷ, collect(y))[1] |> mean
-
-# methods for `ctc_` helper function
-ctc_(ŷ::Array, y::CuArray) = ctc_(CuArray(ŷ), y)
+ctc_(ŷ::Array, y::CuArray) = ctc_(CuArray(ŷ), collect(y))
 
 function ctc_(ŷ::CuArray, y)
 
     ŷ = logsoftmax(ŷ)
-    if any(isinf.(ŷ)) error("Inf in yhat") end
-    if any(isnan.(ŷ)) error("NaN in yhat") end
 
     blank = size(ŷ, 1)
     labels = [Base.argmax(y[:,i]) for i in 1:size(y, 2)]
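
After this cleanup the method table reads: `ctc` converts whichever argument lives on the wrong device, and every mixed-device call funnels into the single GPU implementation `ctc_(ŷ::CuArray, y)`. A hypothetical call site (shapes assumed, not from the repository's tests: one column of ŷ and y per time step, last row reserved for the blank):

    ŷ = CUDA.rand(Float32, 29, 100)     # 28 labels + blank, 100 frames
    y = zeros(Float32, 29, 100)
    for t in 1:100
        y[rand(1:28), t] = 1f0          # one-hot target per frame
    end
    loss = ctc(ŷ, y)                    # GPU ŷ, CPU y: dispatch handles it
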
@@ -289,23 +259,25 @@ function ctc_(ŷ::CuArray, y)
         push!(z′, label)
         push!(z′, blank)
     end
+
     T = size(ŷ, 2)
     U′ = 2*length(z) + 1
-    alphas = CUDA.fill(log(zero(ŷ[1])), T * U′)
-    betas = copy(alphas)
-    output = copy(alphas)
+
+    alphas = CUDA.fill(log(zero(ŷ[1])), U′, T)
+    betas = CUDA.fill(log(zero(ŷ[1])), U′, T)
+    output = CUDA.fill(log(zero(ŷ[1])), U′, T)
 
     nRepeats = countRepeats(labels)
+    nThreads = min(U′, MAX_THREADS)
 
-    # 1 block with `U′` threads
-    @cuda blocks=1 threads=U′ computeAlphaKernel(ŷ, length(z), size(ŷ,2), nRepeats, CuArray(z), CuArray(z′), alphas, blank)
+    @cuda blocks=1 threads=nThreads computeAlphaKernel(ŷ, length(z), size(ŷ,2), nRepeats, CuArray(z), CuArray(z′), alphas, blank)
 
     grads = CUDA.fill(log(zero(ŷ[1])), size(ŷ))
-    accum = CUDA.fill(log(zero(ŷ[1])), length(ŷ))
+    accum = CUDA.fill(log(zero(ŷ[1])), size(ŷ))
 
-    @cuda blocks=1 threads=U′ computeBetasAndGradKernel(ŷ, length(z), size(ŷ,2), nRepeats, CuArray(z′), alphas, betas, output, accum, grads, blank)
+    @cuda blocks=1 threads=nThreads computeBetasAndGradKernel(ŷ, length(z), size(ŷ,2), nRepeats, CuArray(z′), alphas, betas, output, accum, grads, blank)
 
-    ls = reshape(collect(output), U′, T)
+    ls = collect(output)
     ls = vec(-1 .* [logsum(ls[:,i]) for i in 1:size(ls, 2)])
 
     ŷ = alphas = betas = output = accum = nothing
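
Capping the launch at `nThreads = min(U′, MAX_THREADS)` is safe because every loop inside the kernels advances by blockDim().x, i.e. a block-stride loop: when U′ exceeds 256, each thread simply processes several label positions. The pattern in isolation (hypothetical standalone kernel, not from the file):

    # Sketch: the block-stride idiom the CTC kernels rely on.
    function stride_fill!(xs, val)
        i = threadIdx().x
        while i <= length(xs)
            xs[i] = val
            i += blockDim().x           # step past the block size
        end
        return nothing
    end

    xs = CUDA.zeros(Float32, 1000)
    @cuda blocks=1 threads=min(length(xs), MAX_THREADS) stride_fill!(xs, 1f0)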
