This repository has been archived by the owner on Jan 13, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
kernel.cl
356 lines (308 loc) · 11 KB
/
kernel.cl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
// Taken from BLAKE3 reference implementation
// https://github.com/BLAKE3-team/BLAKE3/blob/da4c792d8094f35c05c41c9aeb5dfe4aa67ca1ac/reference_impl/reference_impl.rs#L40
constant const size_t MSG_PERMUTATION[16] = { 2, 6, 3, 10, 7, 0, 4, 13,
1, 11, 12, 5, 9, 14, 15, 8 };
// Taken from BLAKE3 reference implementation
// https://github.com/BLAKE3-team/BLAKE3/blob/da4c792d8094f35c05c41c9aeb5dfe4aa67ca1ac/reference_impl/reference_impl.rs#L36-L38
constant const uint IV[8] = { 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 };
// BLAKE3 constants
// https://github.com/BLAKE3-team/BLAKE3/blob/da4c792d8094f35c05c41c9aeb5dfe4aa67ca1ac/reference_impl/reference_impl.rs#L23-L34
constant const size_t OUT_LEN = 32;
constant const size_t ROUNDS = 7;
constant const uint BLOCK_LEN = 64;
constant const uint CHUNK_START = 1 << 0;
constant const uint CHUNK_END = 1 << 1;
constant const uint PARENT = 1 << 2;
constant const uint ROOT = 1 << 3;
// Permutes input message words using a same-sized temporary array ( 64 -bytes
// ), as per permutation index provided to kernel in constant memory
//
// See
// https://github.com/BLAKE3-team/BLAKE3/blob/da4c792d8094f35c05c41c9aeb5dfe4aa67ca1ac/reference_impl/reference_impl.rs#L67-L73
void
permute(
#if defined(LE_BYTES_TO_WORDS) && defined(WORDS_TO_LE_BYTES)
private uint* const msg
#else
global uint* const msg
#endif
)
{
private
uint permuted[16];
// expecting this loop to be fully unrolled !
// when target device supports opencl c 2.0 minimum
//
// see
// https://www.khronos.org/registry/OpenCL/specs/3.0-unified/html/OpenCL_C.html#specifying-attribute-for-unrolling-loops
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
__attribute__((opencl_unroll_hint(16)))
#endif
for (size_t i = 0; i < 16; i++)
{
permuted[i] = msg[MSG_PERMUTATION[i]];
}
// expecting this loop to be fully unrolled !
// when target device supports opencl c 2.0 minimum
//
// see
// https://www.khronos.org/registry/OpenCL/specs/3.0-unified/html/OpenCL_C.html#specifying-attribute-for-unrolling-loops
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
__attribute__((opencl_unroll_hint(16)))
#endif
for (size_t i = 0; i < 16; i++)
{
msg[i] = permuted[i];
}
}
// A mixing round of blake3, where 64 -bytes input message is mixed with
// hash state ( both column-wise & diagonally )
//
// In blake3 this function will be invoked 7 times for 64 -bytes input mixing !
//
// See
// https://github.com/BLAKE3-team/BLAKE3/blob/da4c792d8094f35c05c41c9aeb5dfe4aa67ca1ac/reference_impl/reference_impl.rs#L54-L65
//
// Note, as this implementation is manually vectorised, 4x4 state matrix is
// required to be diagonalised before applying diagonal mixing stage & also
// after diagonal processing state matrix needs to be undiagonalised so that
// next round of mixing can be applied properly !
#ifndef TO_IL
inline
#endif
void
blake3_round(private uint4* const state,
#if defined(LE_BYTES_TO_WORDS) && defined(WORDS_TO_LE_BYTES)
private const uint* msg
#else
global const uint* msg
#endif
)
{
const uint4 mx = (uint4)(msg[0], msg[2], msg[4], msg[6]);
const uint4 my = (uint4)(msg[1], msg[3], msg[5], msg[7]);
const uint4 mz = (uint4)(msg[8], msg[10], msg[12], msg[14]);
const uint4 mw = (uint4)(msg[9], msg[11], msg[13], msg[15]);
const uint4 rrot_16 = (uint4)(16);
const uint4 rrot_12 = (uint4)(20);
const uint4 rrot_8 = (uint4)(24);
const uint4 rrot_7 = (uint4)(25);
// column-wise mixing
state[0] = state[0] + state[1] + mx;
state[3] = rotate(state[3] ^ state[0], rrot_16);
state[2] = state[2] + state[3];
state[1] = rotate(state[1] ^ state[2], rrot_12);
state[0] = state[0] + state[1] + my;
state[3] = rotate(state[3] ^ state[0], rrot_8);
state[2] = state[2] + state[3];
state[1] = rotate(state[1] ^ state[2], rrot_7);
// state matrix diagonalization
state[1] = state[1].yzwx;
state[2] = state[2].zwxy;
state[3] = state[3].wxyz;
// row-wise mixing
state[0] = state[0] + state[1] + mz;
state[3] = rotate(state[3] ^ state[0], rrot_16);
state[2] = state[2] + state[3];
state[1] = rotate(state[1] ^ state[2], rrot_12);
state[0] = state[0] + state[1] + mw;
state[3] = rotate(state[3] ^ state[0], rrot_8);
state[2] = state[2] + state[3];
state[1] = rotate(state[1] ^ state[2], rrot_7);
// state matrix un-diagonalization
state[1] = state[1].wxyz;
state[2] = state[2].zwxy;
state[3] = state[3].yzwx;
}
// Given input message of 64 -bytes, this function should be producing
// 32 -bytes output chaining value, compressing whole input inside 64 -bytes
// blake3 hash state
//
// This 32 bytes output chaining value is nothing but blake3 digest
// of 64 -bytes input
//
// Note, usually, you'll see compress( ... ) of form
// https://github.com/BLAKE3-team/BLAKE3/blob/da4c792d8094f35c05c41c9aeb5dfe4aa67ca1ac/reference_impl/reference_impl.rs#L75-L81
// but here I'm only interested in 2-to-1 hashing, meaning two 32 -bytes blake3
// digests to be provided as input and I should be producing 32 -bytes output
// digest
//
// So there is only one chunk with only one block inside itself, which is both
// CHUNK_START, CHUNK_END and ROOT
void
compress(
#if defined(LE_BYTES_TO_WORDS) && defined(WORDS_TO_LE_BYTES)
private uint* const msg,
#else
global uint* const msg,
#endif
ulong counter,
uint block_len,
uint flags,
#if defined(LE_BYTES_TO_WORDS) && defined(WORDS_TO_LE_BYTES)
private uint* const out_cv
#else
global uint* const out_cv
#endif
)
{
private
uint4 state[4] = { (uint4)(IV[0], IV[1], IV[2], IV[3]),
(uint4)(IV[4], IV[5], IV[6], IV[7]),
(uint4)(IV[0], IV[1], IV[2], IV[3]),
(uint4)((uint)(counter & 0xffffffff),
(uint)(counter >> 32),
block_len,
flags) };
// round 1
blake3_round(state, msg);
permute(msg);
// round 2
blake3_round(state, msg);
permute(msg);
// round 3
blake3_round(state, msg);
permute(msg);
// round 4
blake3_round(state, msg);
permute(msg);
// round 5
blake3_round(state, msg);
permute(msg);
// round 6
blake3_round(state, msg);
permute(msg);
// round 7
blake3_round(state, msg);
// preparing 32 -bytes output chaining value
state[0] ^= state[2];
state[1] ^= state[3];
// note, I'm skipping
// https://github.com/BLAKE3-team/BLAKE3/blob/da4c792d8094f35c05c41c9aeb5dfe4aa67ca1ac/reference_impl/reference_impl.rs#L118
// because it doesn't any how dictate what output chaining value will be
// writing output chaining value
vstore4(state[0], 0, out_cv);
vstore4(state[1], 1, out_cv);
// indexing into vector lanes like
// https://www.khronos.org/registry/OpenCL/specs/3.0-unified/html/OpenCL_C.html#vector-components
// doesn't seem to perform well
}
// Given a byte array, 4 consequtive little endian bytes to be interpreted as
// unsigned integer of width 32 -bit
//
// Inpired from
// https://doc.rust-lang.org/std/primitive.u32.html#method.from_le_bytes
void
words_from_le_bytes(global const uchar* input, private uint* const msg_words)
{
// partially unroll following loop
// when target device supports opencl c 2.0 minimum
//
// see
// https://www.khronos.org/registry/OpenCL/specs/3.0-unified/html/OpenCL_C.html#specifying-attribute-for-unrolling-loops
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
__attribute__((opencl_unroll_hint(8)))
#endif
for (size_t i = 0; i < 16; i++)
{ // loop should be partially unrolled !
*(msg_words + i) = ((uint) * (input + i * 4 + 3) << 24) |
((uint) * (input + i * 4 + 2) << 16) |
((uint) * (input + i * 4 + 1) << 8) |
((uint) * (input + i * 4 + 0) << 0);
}
}
// Given an array of 32 -bit integers, converts each `uint` to
// four little endian bytes
//
// Inspired from Rust's
// https://doc.rust-lang.org/std/primitive.u32.html#method.to_le_bytes
void
words_to_le_bytes(private const uint* msg_words, global uchar* const output)
{
// fully parallelize following loop
// when target device supports opencl c 2.0 minimum
//
// see
// https://www.khronos.org/registry/OpenCL/specs/3.0-unified/html/OpenCL_C.html#specifying-attribute-for-unrolling-loops
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
__attribute__((opencl_unroll_hint(8)))
#endif
for (size_t i = 0; i < 8; i++)
{
const uint num = *(msg_words + i);
*(output + i * 4 + 0) = (uchar)(num >> 0) & 0xff;
*(output + i * 4 + 1) = (uchar)(num >> 8) & 0xff;
*(output + i * 4 + 2) = (uchar)(num >> 16) & 0xff;
*(output + i * 4 + 3) = (uchar)(num >> 24) & 0xff;
}
}
// Given input byte array/ `uint` array converted from little endian bytes
// this function should be computing 2-to-1 blake3 hash i.e. 64 -bytes input
// is converted to 32 -bytes output digest, where 64 -bytes input is nothing
// but two blake3 digests concatenated to each other
//
// Just wrapper on `compress( ... )` function above
//
// You may want to note, which flags are being set before hashing
// 64 -bytes message. As this is only chunk with only block inside itself
// both CHUNK_START, CHUNK_START are required. Note, ROOT flag is also set
// because BLAKE3 merkle tree has only one node, which is obviously root node !
#if defined(EXPOSE_BLAKE3_HASH)
kernel
#else
#ifndef TO_IL
inline
#endif
#endif
void
hash(
#if defined(LE_BYTES_TO_WORDS) && defined(WORDS_TO_LE_BYTES)
global const uchar* input,
global uchar* const output
#else
global uint* const input,
global uint* const output
#endif
)
{
#if defined(LE_BYTES_TO_WORDS) && defined(WORDS_TO_LE_BYTES)
private
uint msg_words[16];
private
uint out_cv[8];
words_from_le_bytes(input, msg_words);
compress(msg_words, 0, BLOCK_LEN, CHUNK_START | CHUNK_END | ROOT, out_cv);
words_to_le_bytes(out_cv, output);
#else
compress(input, 0, BLOCK_LEN, CHUNK_START | CHUNK_END | ROOT, output);
#endif
}
// Each work-item of this kernel computes 2-to-1 blake3 hash, where 64 -bytes
// input is read from global memory and 32 -bytes output digest is written to
// global memory
//
// It's possible to pass same on-device buffer as `input` and `output`
// but it's guaranteed that two same memory location will not be accessed by two
// work-items at same time !
//
// For this reason, passing input memory offset and output memory offset using
// constant memory which will help indexing into two non-overlapping regions of
// same buffer
//
// Also note, you can always pass sub-buffers, but that will require respecting
// alignment requirements of accelerator device !
#if !(defined(LE_BYTES_TO_WORDS) && defined(WORDS_TO_LE_BYTES))
kernel void
merklize(global uint* const restrict input,
constant size_t* restrict i_offset,
global uint* const restrict output,
constant size_t* restrict o_offset)
{
private
const size_t idx = get_global_id(0);
// idx << 4 => because input being hashed is 64 -bytes wide
// idx << 3 => because output of blake3 hash is 32 -bytes wide
hash(input + *i_offset + (idx << 4), output + *o_offset + (idx << 3));
}
#endif