-
Notifications
You must be signed in to change notification settings - Fork 1.7k
/
cuda_hefty1.cu
414 lines (343 loc) · 12.2 KB
/
cuda_hefty1.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// aus cpu-miner.c
extern int device_map[8];
#include <stdio.h>
#include <memory.h>
#define USE_SHARED 1
// Folgende Definitionen später durch header ersetzen
typedef unsigned int uint32_t;
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
// diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props;
// globaler Speicher für alle HeftyHashes aller Threads
uint32_t *d_heftyHashes[8];
/* Hash-Tabellen */
__constant__ uint32_t hefty_gpu_constantTable[64];
#if USE_SHARED
#define heftyLookUp(x) (*((uint32_t*)heftytab + (x)))
#else
#define heftyLookUp(x) hefty_gpu_constantTable[x]
#endif
// muss expandiert werden
__constant__ uint32_t hefty_gpu_blockHeader[16]; // 2x512 Bit Message
__constant__ uint32_t hefty_gpu_register[8];
__constant__ uint32_t hefty_gpu_sponge[4];
uint32_t hefty_cpu_hashTable[] = {
0x6a09e667UL,
0xbb67ae85UL,
0x3c6ef372UL,
0xa54ff53aUL,
0x510e527fUL,
0x9b05688cUL,
0x1f83d9abUL,
0x5be0cd19UL };
uint32_t hefty_cpu_constantTable[] = {
0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL,
0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL,
0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL,
0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL,
0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL,
0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL,
0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL,
0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL,
0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL,
0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL,
0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL,
0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL,
0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL,
0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL,
0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
};
//#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
static __host__ __device__ uint32_t S(uint32_t x, int n)
{
return (((x) >> (n)) | ((x) << (32 - (n))));
}
#define R(x, n) ((x) >> (n))
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
#define S0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22))
#define S1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25))
#define s0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3))
#define s1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10))
#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
// uint8_t
#define smoosh4(x) ( ((x)>>4) ^ ((x) & 0x0F) )
__host__ __forceinline__ __device__ uint8_t smoosh2(uint32_t x)
{
uint16_t w = (x >> 16) ^ (x & 0xffff);
uint8_t n = smoosh4( (uint8_t)( (w >> 8) ^ (w & 0xFF) ) );
return 24 - (((n >> 2) ^ (n & 0x03)) << 3);
}
// 4 auf einmal
#define smoosh4Quad(x) ( (((x)>>4) ^ (x)) & 0x0F0F0F0F )
#define getByte(x,y) ( ((x) >> (y)) & 0xFF )
__host__ __forceinline__ __device__ void Mangle(uint32_t *inp)
{
uint32_t r = smoosh4Quad(inp[0]);
uint32_t inp0org;
uint32_t tmp0Mask, tmp1Mask;
uint32_t in1, in2, isAddition;
uint32_t tmp;
uint8_t b;
inp[1] = inp[1] ^ S(inp[0], getByte(r, 24));
r += 0x01010101;
tmp = smoosh2(inp[1]);
b = getByte(r,tmp);
inp0org = S(inp[0], b);
tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0
tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0
in1 = (inp[2] & ~inp0org) |
(tmp1Mask & ~inp[2] & inp0org) |
(~tmp0Mask & ~inp[2] & inp0org);
in2 = inp[2] += ~inp0org;
isAddition = ~tmp0Mask & tmp1Mask;
inp[2] = isAddition ? in2 : in1;
r += 0x01010101;
tmp = smoosh2(inp[1] ^ inp[2]);
b = getByte(r,tmp);
inp0org = S(inp[0], b);
tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0
tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0
in1 = (inp[3] & ~inp0org) |
(tmp1Mask & ~inp[3] & inp0org) |
(~tmp0Mask & ~inp[3] & inp0org);
in2 = inp[3] += ~inp0org;
isAddition = ~tmp0Mask & tmp1Mask;
inp[3] = isAddition ? in2 : in1;
inp[0] ^= (inp[1] ^ inp[2]) + inp[3];
}
__host__ __forceinline__ __device__ void Absorb(uint32_t *inp, uint32_t x)
{
inp[0] ^= x;
Mangle(inp);
}
__host__ __forceinline__ __device__ uint32_t Squeeze(uint32_t *inp)
{
uint32_t y = inp[0];
Mangle(inp);
return y;
}
__host__ __forceinline__ __device__ uint32_t Br(uint32_t *sponge, uint32_t x)
{
uint32_t r = Squeeze(sponge);
uint32_t t = ((r >> 8) & 0x1F);
uint32_t y = 1 << t;
uint32_t a = (((r>>1) & 0x01) << t) & y;
uint32_t b = ((r & 0x01) << t) & y;
uint32_t c = x & y;
uint32_t retVal = (x & ~y) | (~b & c) | (a & ~c);
return retVal;
}
__forceinline__ __device__ void hefty_gpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
{
uint32_t tmpBr;
uint32_t brG = Br(sponge, regs[6]);
uint32_t brF = Br(sponge, regs[5]);
uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
uint32_t brE = Br(sponge, regs[4]);
uint32_t tmp2 = tmp1 + S1(brE);
uint32_t brC = Br(sponge, regs[2]);
uint32_t brB = Br(sponge, regs[1]);
uint32_t brA = Br(sponge, regs[0]);
uint32_t tmp3 = Maj(brA, brB, brC);
tmpBr = Br(sponge, regs[0]);
uint32_t tmp4 = tmp3 + S0(tmpBr);
tmpBr = Br(sponge, tmp2);
#pragma unroll 7
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
regs[0] = tmp2 + tmp4;
regs[4] += tmpBr;
}
__host__ void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
{
uint32_t tmpBr;
uint32_t brG = Br(sponge, regs[6]);
uint32_t brF = Br(sponge, regs[5]);
uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
uint32_t brE = Br(sponge, regs[4]);
uint32_t tmp2 = tmp1 + S1(brE);
uint32_t brC = Br(sponge, regs[2]);
uint32_t brB = Br(sponge, regs[1]);
uint32_t brA = Br(sponge, regs[0]);
uint32_t tmp3 = Maj(brA, brB, brC);
tmpBr = Br(sponge, regs[0]);
uint32_t tmp4 = tmp3 + S0(tmpBr);
tmpBr = Br(sponge, tmp2);
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
regs[0] = tmp2 + tmp4;
regs[4] += tmpBr;
}
// Die Hash-Funktion
__global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHash)
{
#if USE_SHARED
extern __shared__ char heftytab[];
if(threadIdx.x < 64)
{
*((uint32_t*)heftytab + threadIdx.x) = hefty_gpu_constantTable[threadIdx.x];
}
__syncthreads();
#endif
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
// bestimme den aktuellen Zähler
uint32_t nounce = startNounce + thread;
// jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory
// reduktion von 256 byte auf 128 byte
uint32_t W1[16];
uint32_t W2[16];
// Initialisiere die register a bis h mit der Hash-Tabelle
uint32_t regs[8];
uint32_t hash[8];
uint32_t sponge[4];
#pragma unroll 4
for(int k=0; k < 4; k++)
sponge[k] = hefty_gpu_sponge[k];
// pre
#pragma unroll 8
for (int k=0; k < 8; k++)
{
regs[k] = hefty_gpu_register[k];
hash[k] = regs[k];
}
//memcpy(W, &hefty_gpu_blockHeader[0], sizeof(uint32_t) * 16); // verbleibende 20 bytes aus Block 2 plus padding
#pragma unroll 16
for(int k=0;k<16;k++)
W1[k] = hefty_gpu_blockHeader[k];
W1[3] = SWAB32(nounce);
// 2. Runde
#pragma unroll 16
for(int j=0;j<16;j++)
Absorb(sponge, W1[j] ^ heftyLookUp(j));
// Progress W1 (Bytes 0...63)
#pragma unroll 16
for(int j=0;j<16;j++)
{
Absorb(sponge, regs[3] ^ regs[7]);
hefty_gpu_round(regs, W1[j], heftyLookUp(j), sponge);
}
// Progress W2 (Bytes 64...127) then W3 (Bytes 128...191) ...
#pragma unroll 3
for(int k=0;k<3;k++)
{
#pragma unroll 2
for(int j=0;j<2;j++)
W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
#pragma unroll 5
for(int j=2;j<7;j++)
W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j];
#pragma unroll 8
for(int j=7;j<15;j++)
W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j];
W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
#pragma unroll 16
for(int j=0;j<16;j++)
{
Absorb(sponge, regs[3] + regs[7]);
hefty_gpu_round(regs, W2[j], heftyLookUp(j + 16 * (k+1)), sponge);
}
#pragma unroll 16
for(int j=0;j<16;j++)
W1[j] = W2[j];
}
#pragma unroll 8
for(int k=0;k<8;k++)
hash[k] += regs[k];
#pragma unroll 8
for(int k=0;k<8;k++)
((uint32_t*)outputHash)[8*thread+k] = SWAB32(hash[k]);
}
}
// Setup-Funktionen
__host__ void hefty_cpu_init(int thr_id, int threads)
{
cudaSetDevice(device_map[thr_id]);
cudaGetDeviceProperties(&props, device_map[thr_id]);
// Kopiere die Hash-Tabellen in den GPU-Speicher
cudaMemcpyToSymbol( hefty_gpu_constantTable,
hefty_cpu_constantTable,
sizeof(uint32_t) * 64 );
// Speicher für alle Hefty1 hashes belegen
cudaMalloc(&d_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads);
}
__host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data)
// data muss 84-Byte haben!
{
// Nachricht expandieren und setzen
uint32_t msgBlock[32];
memset(msgBlock, 0, sizeof(uint32_t) * 32);
memcpy(&msgBlock[0], data, 84);
msgBlock[21] |= 0x80;
msgBlock[31] = 672; // bitlen
for(int i=0;i<31;i++) // Byteorder drehen
msgBlock[i] = SWAB32(msgBlock[i]);
// die erste Runde wird auf der CPU durchgeführt, da diese für
// alle Threads gleich ist. Der Hash wird dann an die Threads
// übergeben
// Erstelle expandierten Block W
uint32_t W[64];
memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16);
for(int j=16;j<64;j++)
W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16];
// Initialisiere die register a bis h mit der Hash-Tabelle
uint32_t regs[8];
uint32_t hash[8];
uint32_t sponge[4];
// pre
memset(sponge, 0, sizeof(uint32_t) * 4);
for (int k=0; k < 8; k++)
{
regs[k] = hefty_cpu_hashTable[k];
hash[k] = regs[k];
}
// 1. Runde
for(int j=0;j<16;j++)
Absorb(sponge, W[j] ^ hefty_cpu_constantTable[j]);
for(int j=0;j<16;j++)
{
Absorb(sponge, regs[3] ^ regs[7]);
hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge);
}
for(int j=16;j<64;j++)
{
Absorb(sponge, regs[3] + regs[7]);
hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge);
}
for(int k=0;k<8;k++)
hash[k] += regs[k];
// sponge speichern
cudaMemcpyToSymbol( hefty_gpu_sponge,
sponge,
sizeof(uint32_t) * 4 );
// hash speichern
cudaMemcpyToSymbol( hefty_gpu_register,
hash,
sizeof(uint32_t) * 8 );
// Blockheader setzen (korrekte Nonce fehlt da drin noch)
cudaMemcpyToSymbol( hefty_gpu_blockHeader,
&msgBlock[16],
64);
}
__host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce)
{
// Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern,
// alle anderen mit 512 Threads.
int threadsperblock = (props.major >= 3) ? 768 : 512;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs
#if USE_SHARED
size_t shared_size = 8 * 64 * sizeof(uint32_t);
#else
size_t shared_size = 0;
#endif
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
hefty_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (void*)d_heftyHashes[thr_id]);
}