This repository was archived by the owner on Jul 4, 2022. It is now read-only.
forked from hashcat/hashcat
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path inc_cipher_rc4.cl
329 lines (238 loc) · 8.48 KB
/
inc_cipher_rc4.cl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
#include "inc_vendor.h"
#include "inc_types.h"
#include "inc_platform.h"
#include "inc_common.h"
#include "inc_cipher_rc4.h"
#ifdef IS_CPU

// CPU pattern: plain linear byte addressing. Byte k of the RC4 sbox lives at
// offset k, so no bank-conflict avoidance is needed and the thread id (lid)
// is unused on this path.

// Pattern linear

// Read sbox byte k.
DECLSPEC u8 GET_KEY8 (LOCAL_AS u32 *S, const u8 k, MAYBE_UNUSED const u64 lid)
{
LOCAL_AS u8 *S8 = (LOCAL_AS u8 *) S;
return S8[k];
}

// Write sbox byte k.
DECLSPEC void SET_KEY8 (LOCAL_AS u32 *S, const u8 k, const u8 v, MAYBE_UNUSED const u64 lid)
{
LOCAL_AS u8 *S8 = (LOCAL_AS u8 *) S;
S8[k] = v;
}

// Write sbox word k (4 bytes at once, used by the identity-permutation fill).
DECLSPEC void SET_KEY32 (LOCAL_AS u32 *S, const u8 k, const u32 v, MAYBE_UNUSED const u64 lid)
{
S[k] = v;
}
#else
// The goal of this pattern is to have the minimum shared memory bank conflicts as possible.
// Bank conflicts force the device to serialize the bank access and this results in performance drops.
//
// Good to know:
// NV and AMD GPU both have exactly 32 shared memory banks (at least on all modern GPU).
// These banks can't be addressed directly, but indirectly.
// Each of the 32 banks add some space to the total LOCAL buffer.
// But this space is not simply appended, but in chunks of 4 bytes:
// Bank 0 provides bytes 0..3, Bank 1 provides bytes 4..7, Bank 2 provides 8..11, and so on..
//
// We design the memory structure that each thread ID aligns with the corresponding bank ID.
// If a thread always access the same bank, then there are no bank conflicts and we reach our goal.
//
// Since we have 32 banks, we ideally operate on 32 threads.
// For NV GPU this aligns perfectly, because native threads = 32.
// For AMD GPU it does not, because native threads = 64. But we can reduce it to only 1 bank conflict per thread.
//
// The size for the S[] buffer for each thread is 256 byte, basically just the RC4 sbox.
// We want to assign 1 thread to 1 bank, so for 32 banks the total size is 8192 bytes (256 * 32 = 8192):
// LOCAL_VK u32 S[64 * FIXED_LOCAL_SIZE];
// Note that sizeof (u32) * 64 = 256 and then multiplied with the thread count.
//
// Addressing:
//
// This is the first major offset and is relevant for thread ID >= 32 (AMD or non-native thread count on NV):
// (t / 32) * 8192
// The first 8192 bytes of S[] are accessed from threads 0..31 and the next 8192 bytes from threads 32..63
// We could also use more than 64 threads but we need to make sure it's a multiple of 32.
//
// Inside this window of 8192 bytes we select the bank id from the thread id:
// (t & 31) * 4
// We need to do the * 4 because of the 4 byte chunks (see top)
//
// Because of the indirect bank ID addressing we can't write from left to right, we write from top to bottom.
// To ensure each thread stays to its assigned bank id from the previous calculation we could simply do k * 128,
// because 128 = 4 (bank chunk size) * 32 (banks).
//
// However, it's not that easy. We need to find a way to enforce a chunk size of 4.
// (k / 4) * 128
//
// Finally we can select the actual target byte from (1 out of 4) from this chunk:
// (k & 3)

// Maps (thread id t, sbox byte index k) to the strided byte offset described above.
#define KEY8(t,k) (((k) & 3) + (((k) / 4) * 128) + (((t) & 31) * 4) + (((t) / 32) * 8192))

// Read sbox byte k for thread lid through the bank-aligned mapping.
DECLSPEC u8 GET_KEY8 (LOCAL_AS u32 *S, const u8 k, const u64 lid)
{
LOCAL_AS u8 *S8 = (LOCAL_AS u8 *) S;
return S8[KEY8 (lid, k)];
}

// Write sbox byte k for thread lid through the bank-aligned mapping.
DECLSPEC void SET_KEY8 (LOCAL_AS u32 *S, const u8 k, const u8 v, const u64 lid)
{
LOCAL_AS u8 *S8 = (LOCAL_AS u8 *) S;
S8[KEY8 (lid, k)] = v;
}

// Word-granular variant of the same mapping: 2048 u32 = 8192 bytes per
// 32-thread window, stride 32 words keeps each thread on its own bank.
#define KEY32(t,k) (((k) * 32) + ((t) & 31) + (((t) / 32) * 2048))

// Write sbox word k (4 bytes) for thread lid through the bank-aligned mapping.
DECLSPEC void SET_KEY32 (LOCAL_AS u32 *S, const u8 k, const u32 v, const u64 lid)
{
S[KEY32 (lid, k)] = v;
}

#undef KEY8
#undef KEY32
#endif
// Run the RC4 key schedule (KSA) for a 40-bit (5 byte) key.
// S is the per-thread 256-byte sbox (accessed through the KEY8/KEY32
// mapping), key holds the 5 key bytes in its first two words, lid is the
// local thread id used for the sbox addressing.
DECLSPEC void rc4_init_40 (LOCAL_AS u32 *S, PRIVATE_AS const u32 *key, const u64 lid)
{
  // Fill the sbox with the identity permutation 0..255, four bytes per store.
  u32 fill = 0x03020100;

  #ifdef _unroll
  #pragma unroll
  #endif
  for (u8 n = 0; n < 64; n++)
  {
    SET_KEY32 (S, n, fill, lid);

    fill += 0x04040404;
  }

  // Unpack the five key bytes once up front.
  const u8 k0 = v8a_from_v32_S (key[0]);
  const u8 k1 = v8b_from_v32_S (key[0]);
  const u8 k2 = v8c_from_v32_S (key[0]);
  const u8 k3 = v8d_from_v32_S (key[0]);
  const u8 k4 = v8a_from_v32_S (key[1]);

  u8 j = 0;

  // KSA unrolled by the key length (5) so no modulo is needed to pick the
  // key byte. 51 * 5 = 255 rounds here, index 255 is handled below.
  #ifdef _unroll
  #pragma unroll
  #endif
  for (u32 i = 0; i < 255; i += 5)
  {
    j += GET_KEY8 (S, i + 0, lid) + k0; rc4_swap (S, i + 0, j, lid);
    j += GET_KEY8 (S, i + 1, lid) + k1; rc4_swap (S, i + 1, j, lid);
    j += GET_KEY8 (S, i + 2, lid) + k2; rc4_swap (S, i + 2, j, lid);
    j += GET_KEY8 (S, i + 3, lid) + k3; rc4_swap (S, i + 3, j, lid);
    j += GET_KEY8 (S, i + 4, lid) + k4; rc4_swap (S, i + 4, j, lid);
  }

  // Final round for index 255: 255 % 5 == 0, so key byte 0 is used again.
  j += GET_KEY8 (S, 255, lid) + k0; rc4_swap (S, 255, j, lid);
}
// Run the RC4 key schedule (KSA) for a 128-bit (16 byte) key.
// S is the per-thread 256-byte sbox, key holds the 16 key bytes in four
// words, lid is the local thread id used for the sbox addressing.
DECLSPEC void rc4_init_128 (LOCAL_AS u32 *S, PRIVATE_AS const u32 *key, const u64 lid)
{
  // Fill the sbox with the identity permutation 0..255, four bytes per store.
  u32 fill = 0x03020100;

  #ifdef _unroll
  #pragma unroll
  #endif
  for (u8 n = 0; n < 64; n++)
  {
    SET_KEY32 (S, n, fill, lid);

    fill += 0x04040404;
  }

  u8 j = 0;

  // 16 outer rounds of 16 sbox positions each: the 16 byte key repeats
  // exactly once per round, so again no modulo is needed.
  for (u32 r = 0; r < 16; r++)
  {
    u8 idx = r * 16;

    for (u32 w = 0; w < 4; w++)
    {
      const u32 kw = key[w];

      j += GET_KEY8 (S, idx, lid) + v8a_from_v32_S (kw); rc4_swap (S, idx, j, lid); idx++;
      j += GET_KEY8 (S, idx, lid) + v8b_from_v32_S (kw); rc4_swap (S, idx, j, lid); idx++;
      j += GET_KEY8 (S, idx, lid) + v8c_from_v32_S (kw); rc4_swap (S, idx, j, lid); idx++;
      j += GET_KEY8 (S, idx, lid) + v8d_from_v32_S (kw); rc4_swap (S, idx, j, lid); idx++;
    }
  }
}
// Exchange sbox bytes i and j for thread lid. Reading both values before
// either write keeps the result correct even when i == j.
DECLSPEC void rc4_swap (LOCAL_AS u32 *S, const u8 i, const u8 j, const u64 lid)
{
  const u8 vi = GET_KEY8 (S, i, lid);
  const u8 vj = GET_KEY8 (S, j, lid);

  SET_KEY8 (S, i, vj, lid);
  SET_KEY8 (S, j, vi, lid);
}
// RC4 PRGA: generate 16 keystream bytes starting from state indices (i, j)
// and xor them over in[0..3] into out[0..3] (private-memory input).
// Returns the updated j index; the caller tracks i, which advances by 16.
DECLSPEC u8 rc4_next_16 (LOCAL_AS u32 *S, const u8 i, const u8 j, PRIVATE_AS const u32 *in, PRIVATE_AS u32 *out, const u64 lid)
{
  u8 a = i;
  u8 b = j;

  #ifdef _unroll
  #pragma unroll
  #endif
  for (int k = 0; k < 4; k++)
  {
    u32 keystream = 0;

    // Four PRGA steps build the four keystream bytes of one u32 word,
    // least significant byte first.
    for (u32 shift = 0; shift < 32; shift += 8)
    {
      a += 1;
      b += GET_KEY8 (S, a, lid);

      rc4_swap (S, a, b, lid);

      const u8 pos = GET_KEY8 (S, a, lid) + GET_KEY8 (S, b, lid); // wraps mod 256

      keystream |= (u32) GET_KEY8 (S, pos, lid) << shift;
    }

    out[k] = in[k] ^ keystream;
  }

  return b;
}
// RC4 PRGA: identical to rc4_next_16 except the 16 input bytes are read
// from global memory instead of private memory. Generates 16 keystream
// bytes from state indices (i, j), xors them over in[0..3] into out[0..3],
// and returns the updated j index (the caller advances i by 16).
DECLSPEC u8 rc4_next_16_global (LOCAL_AS u32 *S, const u8 i, const u8 j, GLOBAL_AS const u32 *in, PRIVATE_AS u32 *out, const u64 lid)
{
  u8 a = i;
  u8 b = j;

  #ifdef _unroll
  #pragma unroll
  #endif
  for (int k = 0; k < 4; k++)
  {
    u32 keystream = 0;

    // Four PRGA steps build the four keystream bytes of one u32 word,
    // least significant byte first.
    for (u32 shift = 0; shift < 32; shift += 8)
    {
      a += 1;
      b += GET_KEY8 (S, a, lid);

      rc4_swap (S, a, b, lid);

      const u8 pos = GET_KEY8 (S, a, lid) + GET_KEY8 (S, b, lid); // wraps mod 256

      keystream |= (u32) GET_KEY8 (S, pos, lid) << shift;
    }

    out[k] = in[k] ^ keystream;
  }

  return b;
}