diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 54fc29929d..32ebd8d8b4 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -472,6 +472,16 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" -Xptxas "-abi=no -v" %(AdditionalOptions) -Xptxas "-abi=no -v" %(AdditionalOptions) + + true + true + true + true + -Xptxas "-abi=no -v" %(AdditionalOptions) + -Xptxas "-abi=no -v" %(AdditionalOptions) + -Xptxas "-abi=no -v" %(AdditionalOptions) + -Xptxas "-abi=no -v" %(AdditionalOptions) + -Xptxas "-abi=no -v" %(AdditionalOptions) -Xptxas "-abi=no -v" %(AdditionalOptions) diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index bc1320d00d..8b7a596809 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -337,5 +337,8 @@ Source Files\CUDA\x11 + + Source Files\CUDA\x11 + \ No newline at end of file diff --git a/cuda_groestlcoin.cu b/cuda_groestlcoin.cu index e153e5cece..b33ff9ebc1 100644 --- a/cuda_groestlcoin.cu +++ b/cuda_groestlcoin.cu @@ -24,7 +24,6 @@ typedef unsigned long long uint64_t; // diese Struktur wird in der Init Funktion angefordert static cudaDeviceProp props; -// globaler Speicher für alle HeftyHashes aller Threads __constant__ uint32_t pTarget[8]; // Single GPU extern uint32_t *d_resultNonce[8]; diff --git a/cuda_myriadgroestl.cu b/cuda_myriadgroestl.cu index fd868ec695..062db46c21 100644 --- a/cuda_myriadgroestl.cu +++ b/cuda_myriadgroestl.cu @@ -24,7 +24,6 @@ typedef unsigned int uint32_t; // diese Struktur wird in der Init Funktion angefordert static cudaDeviceProp props; -// globaler Speicher für alle HeftyHashes aller Threads __constant__ uint32_t pTarget[8]; // Single GPU extern uint32_t *d_resultNonce[8]; diff --git a/x11/cuda_x11_simd512.cu b/x11/cuda_x11_simd512.cu index 940818e276..801910f1e7 100644 --- a/x11/cuda_x11_simd512.cu +++ b/x11/cuda_x11_simd512.cu @@ -1,9 +1,24 @@ +// Parallelisierung: +// +// FFT_8 wird 2 mal 8-fach parallel ausgeführt (in FFT_64) +// und 1 mal 16-fach parallel (in FFT_128_full) +// +// STEP8_IF und STEP8_MAJ beinhalten je zwei 8-fach parallele Operationen + +#define TPB 256 + // aus heavy.cu extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); typedef unsigned int uint32_t; typedef unsigned long long uint64_t; +int *d_state[8]; +uint4 *d_temp4[8]; + +// texture bound to d_temp4[thr_id], for read access in Compaction kernel +texture texRef1D_128; + #define C32(x) ((uint32_t)(x ## U)) #define T32(x) ((x) & C32(0xFFFFFFFF)) @@ -23,99 +38,6 @@ const uint32_t h_IV_512[32] = { 0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 }; -__constant__ int c_FFT[256]; -const int h_FFT[256] = -{ -// this is the FFT result in revbin permuted order -4, -4, 32, -32, -60, 60, 60, -60, 101, -101, 58, -58, 112, -112, -11, 11, -92, 92, --119, 119, 42, -42, -82, 82, 32, -32, 32, -32, 121, -121, 17, -17, -47, 47, 63, --63, 107, -107, -76, 76, -119, 119, -83, 83, 126, -126, 94, -94, -23, 23, -76, -76, -47, 47, 92, -92, -117, 117, 73, -73, -53, 53, 88, -88, -80, 80, -47, 47, -5, -5, 67, -67, 34, -34, 4, -4, 87, -87, -28, 28, -70, 70, -110, 110, -18, 18, 93, --93, 51, -51, 36, -36, 118, -118, -106, 106, 45, -45, -108, 108, -44, 44, 117, --117, -121, 121, -37, 37, 65, -65, 37, -37, 40, -40, -42, 42, 91, -91, -128, 128, --21, 21, 94, -94, -98, 98, -47, 47, 28, -28, 115, -115, 16, -16, -20, 20, 122, --122, 115, -115, 46, -46, 84, -84, -127, 127, 57, -57, 127, -127, -80, 80, 24, --24, 15, -15, 29, -29, -78, 78, -126, 126, 16, -16, 52, -52, 55, -55, 110, -110, --51, 51, -120, 120, -124, 124, -24, 24, -76, 76, 26, -26, -21, 21, -64, 64, -99, -99, 85, -85, -15, 15, -120, 120, -116, 116, 85, -85, 12, -12, -24, 24, 4, -4, -79, -79, 76, -76, 23, -23, 4, -4, -108, 108, -20, 20, 73, -73, -42, 42, -7, 7, --29, 29, -123, 123, 49, -49, -96, 96, -68, 68, -112, 112, 116, -116, -24, 24, 93, --93, -125, 125, -86, 86, 117, -117, -91, 91, 42, -42, 87, -87, -117, 117, 102, -102 -}; - -__constant__ int c_P8[32][8]; -static const int h_P8[32][8] = { -{ 2, 66, 34, 98, 18, 82, 50, 114 }, -{ 6, 70, 38, 102, 22, 86, 54, 118 }, -{ 0, 64, 32, 96, 16, 80, 48, 112 }, -{ 4, 68, 36, 100, 20, 84, 52, 116 }, -{ 14, 78, 46, 110, 30, 94, 62, 126 }, -{ 10, 74, 42, 106, 26, 90, 58, 122 }, -{ 12, 76, 44, 108, 28, 92, 60, 124 }, -{ 8, 72, 40, 104, 24, 88, 56, 120 }, -{ 15, 79, 47, 111, 31, 95, 63, 127 }, -{ 13, 77, 45, 109, 29, 93, 61, 125 }, -{ 3, 67, 35, 99, 19, 83, 51, 115 }, -{ 1, 65, 33, 97, 17, 81, 49, 113 }, -{ 9, 73, 41, 105, 25, 89, 57, 121 }, -{ 11, 75, 43, 107, 27, 91, 59, 123 }, -{ 5, 69, 37, 101, 21, 85, 53, 117 }, -{ 7, 71, 39, 103, 23, 87, 55, 119 }, -{ 8, 72, 40, 104, 24, 88, 56, 120 }, -{ 4, 68, 36, 100, 20, 84, 52, 116 }, -{ 14, 78, 46, 110, 30, 94, 62, 126 }, -{ 2, 66, 34, 98, 18, 82, 50, 114 }, -{ 6, 70, 38, 102, 22, 86, 54, 118 }, -{ 10, 74, 42, 106, 26, 90, 58, 122 }, -{ 0, 64, 32, 96, 16, 80, 48, 112 }, -{ 12, 76, 44, 108, 28, 92, 60, 124 }, -{ 134, 198, 166, 230, 150, 214, 182, 246 }, -{ 128, 192, 160, 224, 144, 208, 176, 240 }, -{ 136, 200, 168, 232, 152, 216, 184, 248 }, -{ 142, 206, 174, 238, 158, 222, 190, 254 }, -{ 140, 204, 172, 236, 156, 220, 188, 252 }, -{ 138, 202, 170, 234, 154, 218, 186, 250 }, -{ 130, 194, 162, 226, 146, 210, 178, 242 }, -{ 132, 196, 164, 228, 148, 212, 180, 244 }, -}; - -__constant__ int c_Q8[32][8]; -static const int h_Q8[32][8] = { -{ 130, 194, 162, 226, 146, 210, 178, 242 }, -{ 134, 198, 166, 230, 150, 214, 182, 246 }, -{ 128, 192, 160, 224, 144, 208, 176, 240 }, -{ 132, 196, 164, 228, 148, 212, 180, 244 }, -{ 142, 206, 174, 238, 158, 222, 190, 254 }, -{ 138, 202, 170, 234, 154, 218, 186, 250 }, -{ 140, 204, 172, 236, 156, 220, 188, 252 }, -{ 136, 200, 168, 232, 152, 216, 184, 248 }, -{ 143, 207, 175, 239, 159, 223, 191, 255 }, -{ 141, 205, 173, 237, 157, 221, 189, 253 }, -{ 131, 195, 163, 227, 147, 211, 179, 243 }, -{ 129, 193, 161, 225, 145, 209, 177, 241 }, -{ 137, 201, 169, 233, 153, 217, 185, 249 }, -{ 139, 203, 171, 235, 155, 219, 187, 251 }, -{ 133, 197, 165, 229, 149, 213, 181, 245 }, -{ 135, 199, 167, 231, 151, 215, 183, 247 }, -{ 9, 73, 41, 105, 25, 89, 57, 121 }, -{ 5, 69, 37, 101, 21, 85, 53, 117 }, -{ 15, 79, 47, 111, 31, 95, 63, 127 }, -{ 3, 67, 35, 99, 19, 83, 51, 115 }, -{ 7, 71, 39, 103, 23, 87, 55, 119 }, -{ 11, 75, 43, 107, 27, 91, 59, 123 }, -{ 1, 65, 33, 97, 17, 81, 49, 113 }, -{ 13, 77, 45, 109, 29, 93, 61, 125 }, -{ 135, 199, 167, 231, 151, 215, 183, 247 }, -{ 129, 193, 161, 225, 145, 209, 177, 241 }, -{ 137, 201, 169, 233, 153, 217, 185, 249 }, -{ 143, 207, 175, 239, 159, 223, 191, 255 }, -{ 141, 205, 173, 237, 157, 221, 189, 253 }, -{ 139, 203, 171, 235, 155, 219, 187, 251 }, -{ 131, 195, 163, 227, 147, 211, 179, 243 }, -{ 133, 197, 165, 229, 149, 213, 181, 245 }, -}; - __constant__ int c_FFT128_8_16_Twiddle[128]; static const int h_FFT128_8_16_Twiddle[128] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -160,67 +82,7 @@ static const int h_FFT256_2_128_Twiddle[128] = { #define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z)) #define MAJ(x, y, z) (((z) & (y)) | (((z) | (y)) & (x))) - -__device__ __forceinline__ void STEP8_IF(const uint32_t *w, const int i, const int r, const int s, uint32_t *A, const uint32_t *B, const uint32_t *C, uint32_t *D) -{ - int j; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } -#pragma unroll 8 - for(j=0; j<8; j++) { - D[j] = D[j] + w[j] + IF(A[j], B[j], C[j]); - D[j] = T32(ROTL32(T32(D[j]), s) + R[j^p8_xor(i)]); - A[j] = R[j]; - } -} - -__device__ __forceinline__ void STEP8_MAJ(const uint32_t *w, const int i, const int r, const int s, uint32_t *A, const uint32_t *B, const uint32_t *C, uint32_t *D) -{ - int j; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } -#pragma unroll 8 - for(j=0; j<8; j++) { - D[j] = D[j] + w[j] + MAJ(A[j], B[j], C[j]); - D[j] = T32(ROTL32(T32(D[j]), s) + R[j^p8_xor(i)]); - A[j] = R[j]; - } -} - -__device__ __forceinline__ void Round8(uint32_t A[32], const int y[256], int i, - int r, int s, int t, int u) { - int code = i<2? 185: 233; - uint32_t w[8][8]; - int a, b; - - /* - * The FFT output y is in revbin permuted order, - * but this is included in the tables P and Q - */ - -#pragma unroll 8 - for(a=0; a<8; a++) -#pragma unroll 8 - for(b=0; b<8; b++) - w[a][b] = __byte_perm( (y[c_P8[8*i+a][b]] * code), (y[c_Q8[8*i+a][b]] * code), 0x5410); - - STEP8_IF(w[0], 8*i+0, r, s, A, &A[8], &A[16], &A[24]); - STEP8_IF(w[1], 8*i+1, s, t, &A[24], A, &A[8], &A[16]); - STEP8_IF(w[2], 8*i+2, t, u, &A[16], &A[24], A, &A[8]); - STEP8_IF(w[3], 8*i+3, u, r, &A[8], &A[16], &A[24], A); - - STEP8_MAJ(w[4], 8*i+4, r, s, A, &A[8], &A[16], &A[24]); - STEP8_MAJ(w[5], 8*i+5, s, t, &A[24], A, &A[8], &A[16]); - STEP8_MAJ(w[6], 8*i+6, t, u, &A[16], &A[24], A, &A[8]); - STEP8_MAJ(w[7], 8*i+7, u, r, &A[8], &A[16], &A[24], A); -} - +#include "x11/simd_functions.cu" /********************* Message expansion ************************/ @@ -305,7 +167,7 @@ X(j) = (u-v) << (2*n); \ #undef BUTTERFLY } -__device__ __forceinline__ void FFT_16(int *y, int stripe) { +__device__ __forceinline__ void FFT_16(int *y) { /* * FFT_16 using w=2 as 16th root of unity @@ -313,115 +175,124 @@ __device__ __forceinline__ void FFT_16(int *y, int stripe) { * Output data is in revbin_permuted order. */ -#define X(i) y[stripe*i] - -#define DO_REDUCE(i) \ -X(i) = REDUCE(X(i)) - #define DO_REDUCE_FULL_S(i) \ do { \ -X(i) = REDUCE(X(i)); \ -X(i) = EXTRA_REDUCE_S(X(i)); \ +y[i] = REDUCE(y[i]); \ +y[i] = EXTRA_REDUCE_S(y[i]); \ } while(0) -#define BUTTERFLY(i,j,n) \ -do { \ -int u= X(i); \ -int v= X(j); \ -X(i) = u+v; \ -X(j) = (u-v) << n; \ -} while(0) + int u,v; - BUTTERFLY(0, 8, 0); - BUTTERFLY(1, 9, 1); - BUTTERFLY(2, 10, 2); - BUTTERFLY(3, 11, 3); - BUTTERFLY(4, 12, 4); - BUTTERFLY(5, 13, 5); - BUTTERFLY(6, 14, 6); - BUTTERFLY(7, 15, 7); - - DO_REDUCE(11); - DO_REDUCE(12); - DO_REDUCE(13); - DO_REDUCE(14); - DO_REDUCE(15); - - BUTTERFLY( 0, 4, 0); - BUTTERFLY( 1, 5, 2); - BUTTERFLY( 2, 6, 4); - BUTTERFLY( 3, 7, 6); - - BUTTERFLY( 8, 12, 0); - BUTTERFLY( 9, 13, 2); - BUTTERFLY(10, 14, 4); - BUTTERFLY(11, 15, 6); - - DO_REDUCE(5); - DO_REDUCE(7); - DO_REDUCE(13); - DO_REDUCE(15); - - BUTTERFLY( 0, 2, 0); - BUTTERFLY( 1, 3, 4); - BUTTERFLY( 4, 6, 0); - BUTTERFLY( 5, 7, 4); - - BUTTERFLY( 8, 10, 0); - BUTTERFLY(12, 14, 0); - BUTTERFLY( 9, 11, 4); - BUTTERFLY(13, 15, 4); - - - BUTTERFLY( 0, 1, 0); - BUTTERFLY( 2, 3, 0); - BUTTERFLY( 4, 5, 0); - BUTTERFLY( 6, 7, 0); - - BUTTERFLY( 8, 9, 0); - BUTTERFLY(10, 11, 0); - BUTTERFLY(12, 13, 0); - BUTTERFLY(14, 15, 0); - - DO_REDUCE_FULL_S( 0); - DO_REDUCE_FULL_S( 1); - DO_REDUCE_FULL_S( 2); - DO_REDUCE_FULL_S( 3); - DO_REDUCE_FULL_S( 4); - DO_REDUCE_FULL_S( 5); - DO_REDUCE_FULL_S( 6); - DO_REDUCE_FULL_S( 7); - DO_REDUCE_FULL_S( 8); - DO_REDUCE_FULL_S( 9); - DO_REDUCE_FULL_S(10); - DO_REDUCE_FULL_S(11); - DO_REDUCE_FULL_S(12); - DO_REDUCE_FULL_S(13); - DO_REDUCE_FULL_S(14); - DO_REDUCE_FULL_S(15); + // BUTTERFLY(0, 8, 0); + // BUTTERFLY(1, 9, 1); + // BUTTERFLY(2, 10, 2); + // BUTTERFLY(3, 11, 3); + // BUTTERFLY(4, 12, 4); + // BUTTERFLY(5, 13, 5); + // BUTTERFLY(6, 14, 6); + // BUTTERFLY(7, 15, 7); + { + u= y[0]; // 0..7 + v= y[1]; // 8..15 + y[0] = u+v; + y[1] = (u-v) << (threadIdx.x&7); + } + + // DO_REDUCE(11); + // DO_REDUCE(12); + // DO_REDUCE(13); + // DO_REDUCE(14); + // DO_REDUCE(15); + if ((threadIdx.x&7) >=3) y[1] = REDUCE(y[1]); // 11...15 + + // BUTTERFLY( 0, 4, 0); + // BUTTERFLY( 1, 5, 2); + // BUTTERFLY( 2, 6, 4); + // BUTTERFLY( 3, 7, 6); + { + u= __shfl((int)y[0], (threadIdx.x&3),8); // 0,1,2,3 0,1,2,3 + v= __shfl((int)y[0],4+(threadIdx.x&3),8); // 4,5,6,7 4,5,6,7 + y[0] = ((threadIdx.x&7) < 4) ? (u+v) : ((u-v) << (2*(threadIdx.x&3))); + } + + // BUTTERFLY( 8, 12, 0); + // BUTTERFLY( 9, 13, 2); + // BUTTERFLY(10, 14, 4); + // BUTTERFLY(11, 15, 6); + { + u= __shfl((int)y[1], (threadIdx.x&3),8); // 8,9,10,11 8,9,10,11 + v= __shfl((int)y[1],4+(threadIdx.x&3),8); // 12,13,14,15 12,13,14,15 + y[1] = ((threadIdx.x&7) < 4) ? (u+v) : ((u-v) << (2*(threadIdx.x&3))); + } + + // DO_REDUCE(5); + // DO_REDUCE(7); + // DO_REDUCE(13); + // DO_REDUCE(15); + if ((threadIdx.x&1) && (threadIdx.x&7) >= 4) { + y[0] = REDUCE(y[0]); // 5, 7 + y[1] = REDUCE(y[1]); // 13, 15 + } + + // BUTTERFLY( 0, 2, 0); + // BUTTERFLY( 1, 3, 4); + // BUTTERFLY( 4, 6, 0); + // BUTTERFLY( 5, 7, 4); + { + u= __shfl((int)y[0], (threadIdx.x&5),8); // 0,1,0,1 4,5,4,5 + v= __shfl((int)y[0],2+(threadIdx.x&5),8); // 2,3,2,3 6,7,6,7 + y[0] = ((threadIdx.x&3) < 2) ? (u+v) : ((u-v) << (4*(threadIdx.x&1))); + } + + // BUTTERFLY( 8, 10, 0); + // BUTTERFLY( 9, 11, 4); + // BUTTERFLY(12, 14, 0); + // BUTTERFLY(13, 15, 4); + { + u= __shfl((int)y[1], (threadIdx.x&5),8); // 8,9,8,9 12,13,12,13 + v= __shfl((int)y[1],2+(threadIdx.x&5),8); // 10,11,10,11 14,15,14,15 + y[1] = ((threadIdx.x&3) < 2) ? (u+v) : ((u-v) << (4*(threadIdx.x&1))); + } + + // BUTTERFLY( 0, 1, 0); + // BUTTERFLY( 2, 3, 0); + // BUTTERFLY( 4, 5, 0); + // BUTTERFLY( 6, 7, 0); + { + u= __shfl((int)y[0], (threadIdx.x&6),8); // 0,0,2,2 4,4,6,6 + v= __shfl((int)y[0],1+(threadIdx.x&6),8); // 1,1,3,3 5,5,7,7 + y[0] = ((threadIdx.x&1) < 1) ? (u+v) : (u-v); + } + + // BUTTERFLY( 8, 9, 0); + // BUTTERFLY(10, 11, 0); + // BUTTERFLY(12, 13, 0); + // BUTTERFLY(14, 15, 0); + { + u= __shfl((int)y[1], (threadIdx.x&6),8); // 8,8,10,10 12,12,14,14 + v= __shfl((int)y[1],1+(threadIdx.x&6),8); // 9,9,11,11 13,13,15,15 + y[1] = ((threadIdx.x&1) < 1) ? (u+v) : (u-v); + } + + DO_REDUCE_FULL_S( 0); // 0...7 + DO_REDUCE_FULL_S( 1); // 8...15 -#undef X -#undef DO_REDUCE #undef DO_REDUCE_FULL_S -#undef BUTTERFLY } -__device__ __forceinline__ void FFT_128_full(int *y) { +__device__ __forceinline__ void FFT_128_full(int y[128]) { int i; -#pragma unroll 16 - for (i=0; i<16; i++) { - FFT_8(y+i,16); - } + FFT_8(y+0,2); // eight parallel FFT8's + FFT_8(y+1,2); // eight parallel FFT8's -#pragma unroll 128 - for (i=0; i<128; i++) - /*if (i & 7)*/ y[i] = REDUCE(y[i]*c_FFT128_8_16_Twiddle[i]); +#pragma unroll 16 + for (i=0; i<16; i++) + /*if (i & 7)*/ y[i] = REDUCE(y[i]*c_FFT128_8_16_Twiddle[i*8+(threadIdx.x&7)]); #pragma unroll 8 - for (i=0; i<8; i++) { - FFT_16(y+16*i,1); - } + for (i=0; i<8; i++) + FFT_16(y+2*i); // eight sequential FFT16's, each one executed in parallel by 8 threads } @@ -435,116 +306,323 @@ __device__ __forceinline__ void FFT_256_halfzero(int y[256]) { * Output data is in revbin_permuted order. * In place. */ - const int tmp = y[127]; + const int tmp = y[15]; -#pragma unroll 127 - for (i=0; i<127; i++) - y[128+i] = REDUCE(y[i] * c_FFT256_2_128_Twiddle[i]); +#pragma unroll 8 + for (i=0; i<8; i++) + y[16+i] = REDUCE(y[i] * c_FFT256_2_128_Twiddle[8*i+(threadIdx.x&7)]); +#pragma unroll 8 + for (i=8; i<16; i++) + y[16+i] = 0; - /* handle X^255 with an additionnal butterfly */ - y[127] = REDUCE(tmp + 1); - y[255] = REDUCE((tmp - 1) * c_FFT256_2_128_Twiddle[127]); + /* handle X^255 with an additional butterfly */ + if ((threadIdx.x&7) == 7) + { + y[15] = REDUCE(tmp + 1); + y[31] = REDUCE((tmp - 1) * c_FFT256_2_128_Twiddle[127]); + } FFT_128_full(y); - FFT_128_full(y+128); + FFT_128_full(y+16); } -__device__ __forceinline__ void SIMD_Compress(uint32_t A[32], const int *expanded, const uint32_t *M) { - - uint32_t IV[4][8]; - int i; - - /* Save the chaining value for the feed-forward */ - -#pragma unroll 8 - for(i=0; i<8; i++) { - IV[0][i] = A[i]; - IV[1][i] = (&A[8])[i]; - IV[2][i] = (&A[16])[i]; - IV[3][i] = (&A[24])[i]; - } +/***************************************************/ - /* XOR the message to the chaining value */ - /* we can XOR word-by-word */ - - { -#pragma unroll 8 - for(i=0; i<8; i++) { - A[i] ^= M[i]; - (&A[8])[i] ^= M[8+i]; - } - } +__device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4) +{ + int i; - /* Run the feistel ladders with the expanded message */ - { - Round8(A, expanded, 0, 3, 23, 17, 27); - Round8(A, expanded, 1, 28, 19, 22, 7); - Round8(A, expanded, 2, 29, 9, 15, 5); - Round8(A, expanded, 3, 4, 13, 10, 25); - - STEP8_IF(IV[0], 32, 4, 13, A, &A[8], &A[16], &A[24]); - STEP8_IF(IV[1], 33, 13, 10, &A[24], A, &A[8], &A[16]); - STEP8_IF(IV[2], 34, 10, 25, &A[16], &A[24], A, &A[8]); - STEP8_IF(IV[3], 35, 25, 4, &A[8], &A[16], &A[24], A); + /* Message Expansion using Number Theoretical Transform similar to FFT */ + int expanded[32]; +#pragma unroll 4 + for (i=0; i < 4; i++) { + expanded[ i] = __byte_perm(__shfl((int)data[0], 2*i, 8), __shfl((int)data[0], (2*i)+1, 8), threadIdx.x&7)&0xff; + expanded[4+i] = __byte_perm(__shfl((int)data[1], 2*i, 8), __shfl((int)data[1], (2*i)+1, 8), threadIdx.x&7)&0xff; } +#pragma unroll 8 + for (i=8; i < 16; i++) + expanded[i] = 0; + + FFT_256_halfzero(expanded); + + // store w matrices in global memory + +#define mul_185(x) ( (x)*185 ) +#define mul_233(x) ( (x)*233 ) + + uint4 vec0; + int P, Q, P1, Q1, P2, Q2; + bool even = (threadIdx.x & 1) == 0; + +// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 +// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 +// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 + + // 2 6 0 4 + const int perm0[8] = { 2,3,6,7,0,1,4,5 }; // TODO: das landet im lmem. doof. + + P1 = expanded[ 0]; P2 = __shfl(expanded[ 2], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[16]; Q2 = __shfl(expanded[18], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm0[threadIdx.x&7], 8); + P1 = expanded[ 8]; P2 = __shfl(expanded[10], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[24]; Q2 = __shfl(expanded[26], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm0[threadIdx.x&7], 8); + P1 = expanded[ 4]; P2 = __shfl(expanded[ 6], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[20]; Q2 = __shfl(expanded[22], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm0[threadIdx.x&7], 8); + P1 = expanded[12]; P2 = __shfl(expanded[14], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[28]; Q2 = __shfl(expanded[30], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm0[threadIdx.x&7], 8); + g_temp4[threadIdx.x&7] = vec0; + +// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 +// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 +// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 +// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + + // 6 2 4 0 + const int perm1[8] = { 6,7,2,3,4,5,0,1 }; // TODO: das landet im lmem. doof. + + P1 = expanded[ 1]; P2 = __shfl(expanded[ 3], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[17]; Q2 = __shfl(expanded[19], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm1[threadIdx.x&7], 8); + P1 = expanded[ 9]; P2 = __shfl(expanded[11], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[25]; Q2 = __shfl(expanded[27], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm1[threadIdx.x&7], 8); + P1 = expanded[ 5]; P2 = __shfl(expanded[ 7], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[21]; Q2 = __shfl(expanded[23], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm1[threadIdx.x&7], 8); + P1 = expanded[13]; P2 = __shfl(expanded[15], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[29]; Q2 = __shfl(expanded[31], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm1[threadIdx.x&7], 8); + g_temp4[8+(threadIdx.x&7)] = vec0; + +// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 +// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 +// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + + // 7 5 3 1 + const int perm2[8] = { 7,6,5,4,3,2,1,0 }; // TODO: das landet im lmem. doof. + + bool hi = (threadIdx.x&7)>=4; + + P1 = hi?expanded[ 1]:expanded[ 0]; P2 = __shfl(hi?expanded[ 3]:expanded[ 2], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; + Q1 = hi?expanded[17]:expanded[16]; Q2 = __shfl(hi?expanded[19]:expanded[18], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm2[threadIdx.x&7], 8); + P1 = hi?expanded[ 9]:expanded[ 8]; P2 = __shfl(hi?expanded[11]:expanded[10], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; + Q1 = hi?expanded[25]:expanded[24]; Q2 = __shfl(hi?expanded[27]:expanded[26], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm2[threadIdx.x&7], 8); + P1 = hi?expanded[ 5]:expanded[ 4]; P2 = __shfl(hi?expanded[ 7]:expanded[ 6], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; + Q1 = hi?expanded[21]:expanded[20]; Q2 = __shfl(hi?expanded[23]:expanded[22], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm2[threadIdx.x&7], 8); + P1 = hi?expanded[13]:expanded[12]; P2 = __shfl(hi?expanded[15]:expanded[14], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; + Q1 = hi?expanded[29]:expanded[28]; Q2 = __shfl(hi?expanded[31]:expanded[30], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm2[threadIdx.x&7], 8); + g_temp4[16+(threadIdx.x&7)] = vec0; + +// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 +// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 +// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 +// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 + + // 1 3 5 7 + const int perm3[8] = { 1,0,3,2,5,4,7,6 }; // TODO: das landet im lmem. doof. + + bool lo = (threadIdx.x&7)<4; + + P1 = lo?expanded[ 1]:expanded[ 0]; P2 = __shfl(lo?expanded[ 3]:expanded[ 2], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; + Q1 = lo?expanded[17]:expanded[16]; Q2 = __shfl(lo?expanded[19]:expanded[18], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm3[threadIdx.x&7], 8); + P1 = lo?expanded[ 9]:expanded[ 8]; P2 = __shfl(lo?expanded[11]:expanded[10], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; + Q1 = lo?expanded[25]:expanded[24]; Q2 = __shfl(lo?expanded[27]:expanded[26], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm3[threadIdx.x&7], 8); + P1 = lo?expanded[ 5]:expanded[ 4]; P2 = __shfl(lo?expanded[ 7]:expanded[ 6], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; + Q1 = lo?expanded[21]:expanded[20]; Q2 = __shfl(lo?expanded[23]:expanded[22], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm3[threadIdx.x&7], 8); + P1 = lo?expanded[13]:expanded[12]; P2 = __shfl(lo?expanded[15]:expanded[14], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; + Q1 = lo?expanded[29]:expanded[28]; Q2 = __shfl(lo?expanded[31]:expanded[30], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm3[threadIdx.x&7], 8); + g_temp4[24+(threadIdx.x&7)] = vec0; + +// 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 +// 0 8 4 12 2 10 6 14 0 8 4 12 2 10 6 14 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 +// 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 +// 0 8 4 12 2 10 6 14 0 8 4 12 2 10 6 14 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 + +//{ 8, 72, 40, 104, 24, 88, 56, 120 }, { 9, 73, 41, 105, 25, 89, 57, 121 }, +//{ 4, 68, 36, 100, 20, 84, 52, 116 }, { 5, 69, 37, 101, 21, 85, 53, 117 }, +//{ 14, 78, 46, 110, 30, 94, 62, 126 }, { 15, 79, 47, 111, 31, 95, 63, 127 }, +//{ 2, 66, 34, 98, 18, 82, 50, 114 }, { 3, 67, 35, 99, 19, 83, 51, 115 }, + + const int perm4[8] = { 0,1,4,5,6,7,2,3 }; // TODO: das landet im lmem. doof. + + bool sel = ((threadIdx.x+2)&7) >= 4; // 2,3,4,5 + + P1 = sel?expanded[0]:expanded[1]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[2]:expanded[3]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm4[threadIdx.x&7], 8); + P1 = sel?expanded[8]:expanded[9]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[10]:expanded[11]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm4[threadIdx.x&7], 8); + P1 = sel?expanded[4]:expanded[5]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[6]:expanded[7]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm4[threadIdx.x&7], 8); + P1 = sel?expanded[12]:expanded[13]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[14]:expanded[15]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm4[threadIdx.x&7], 8); + + g_temp4[32+(threadIdx.x&7)] = vec0; + +// 0 8 4 12 2 10 6 14 0 8 4 12 2 10 6 14 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 +// 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 +// 0 8 4 12 2 10 6 14 0 8 4 12 2 10 6 14 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 +// 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 + + const int perm5[8] = { 6,7,2,3,0,1,4,5 }; // TODO: das landet im lmem. doof. + + P1 = sel?expanded[1]:expanded[0]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[3]:expanded[2]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm5[threadIdx.x&7], 8); + P1 = sel?expanded[9]:expanded[8]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[11]:expanded[10]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm5[threadIdx.x&7], 8); + P1 = sel?expanded[5]:expanded[4]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[7]:expanded[6]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm5[threadIdx.x&7], 8); + P1 = sel?expanded[13]:expanded[12]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[15]:expanded[14]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm5[threadIdx.x&7], 8); + + g_temp4[40+(threadIdx.x&7)] = vec0; + +// 16 24 20 28 18 26 22 30 16 24 20 28 18 26 22 30 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 +// 16 24 20 28 18 26 22 30 16 24 20 28 18 26 22 30 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 +// 17 25 21 29 19 27 23 31 17 25 21 29 19 27 23 31 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 +// 17 25 21 29 19 27 23 31 17 25 21 29 19 27 23 31 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 + + const int perm6[8] = { 6,7,0,1,4,5,2,3 }; // TODO: das landet im lmem. doof. + // sel markiert threads 2,3,4,5 + + int t; + t = __shfl(expanded[17],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[16]; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[19],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[18]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm6[threadIdx.x&7], 8); + t = __shfl(expanded[25],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[24]; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[27],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[26]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm6[threadIdx.x&7], 8); + t = __shfl(expanded[21],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[20]; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[23],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[22]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm6[threadIdx.x&7], 8); + t = __shfl(expanded[29],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[28]; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[31],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[30]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm6[threadIdx.x&7], 8); + + g_temp4[48+(threadIdx.x&7)] = vec0; + +// 17 25 21 29 19 27 23 31 17 25 21 29 19 27 23 31 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 +// 17 25 21 29 19 27 23 31 17 25 21 29 19 27 23 31 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 +// 16 24 20 28 18 26 22 30 16 24 20 28 18 26 22 30 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 +// 16 24 20 28 18 26 22 30 16 24 20 28 18 26 22 30 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 + + const int perm7[8] = { 4,5,2,3,6,7,0,1 }; // TODO: das landet im lmem. doof. + // sel markiert threads 2,3,4,5 + + t = __shfl(expanded[16],(threadIdx.x+4)&7,8); P1 = sel?expanded[17]:t; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[18],(threadIdx.x+4)&7,8); Q2 = sel?expanded[19]:t; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm7[threadIdx.x&7], 8); + t = __shfl(expanded[24],(threadIdx.x+4)&7,8); P1 = sel?expanded[25]:t; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[26],(threadIdx.x+4)&7,8); Q2 = sel?expanded[27]:t; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm7[threadIdx.x&7], 8); + t = __shfl(expanded[20],(threadIdx.x+4)&7,8); P1 = sel?expanded[21]:t; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[22],(threadIdx.x+4)&7,8); Q2 = sel?expanded[23]:t; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm7[threadIdx.x&7], 8); + t = __shfl(expanded[28],(threadIdx.x+4)&7,8); P1 = sel?expanded[29]:t; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[30],(threadIdx.x+4)&7,8); Q2 = sel?expanded[31]:t; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm7[threadIdx.x&7], 8); + + g_temp4[56+(threadIdx.x&7)] = vec0; + +#undef mul_185 +#undef mul_233 } - /***************************************************/ +// Die Hash-Funktion +__global__ void __launch_bounds__(TPB,4) +x11_simd512_gpu_expand_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_temp4) +{ + int thread = (blockDim.x * blockIdx.x + threadIdx.x)/8; + if (thread < threads) + { + uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); -__device__ __forceinline__ void SIMDHash(const uint32_t *data, uint32_t *hashval) { - - uint32_t A[32]; - int i; + int hashPosition = nounce - startNounce; - uint32_t buffer[16]; + uint32_t *inpHash = (uint32_t*)&g_hash[8 * hashPosition]; -#pragma unroll 32 - for (i=0; i < 32; i++) A[i] = c_IV_512[i]; + // Hash einlesen und auf 8 Threads und 2 Register verteilen + uint32_t Hash[2]; +#pragma unroll 2 + for (int i=0; i<2; i++) + Hash[i] = inpHash[8*i+(threadIdx.x&7)]; -#pragma unroll 16 - for (i=0; i < 16; i++) buffer[i] = data[i]; + // Puffer für expandierte Nachricht + uint4 *temp4 = &g_temp4[64 * hashPosition]; - /* Message Expansion using Number Theoretical Transform similar to FFT */ - int expanded[256]; - { -#pragma unroll 16 - for(i=0; i<64; i+=4) { - expanded[i+0] = __byte_perm(buffer[i/4],0,0x4440); - expanded[i+1] = __byte_perm(buffer[i/4],0,0x4441); - expanded[i+2] = __byte_perm(buffer[i/4],0,0x4442); - expanded[i+3] = __byte_perm(buffer[i/4],0,0x4443); - } -#pragma unroll 16 - for(i=64; i<128; i+=4) { - expanded[i+0] = 0; - expanded[i+1] = 0; - expanded[i+2] = 0; - expanded[i+3] = 0; + Expansion(Hash, temp4); } +} - FFT_256_halfzero(expanded); - } +__global__ void __launch_bounds__(TPB,4) +x11_simd512_gpu_compress1_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, int *g_state) +{ + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - /* Compression Function */ - SIMD_Compress(A, expanded, buffer); + int hashPosition = nounce - startNounce; + uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; + + Compression1(Hash, hashPosition, g_fft4, g_state); + } +} - /* Padding Round with known input (hence the FFT can be precomputed) */ - buffer[0] = 512; -#pragma unroll 15 - for (i=1; i < 16; i++) buffer[i] = 0; +__global__ void __launch_bounds__(TPB,4) +x11_simd512_gpu_compress2_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, int *g_state) +{ + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - SIMD_Compress(A, c_FFT, buffer); + int hashPosition = nounce - startNounce; -#pragma unroll 16 - for (i=0; i < 16; i++) - hashval[i] = A[i]; + Compression2(hashPosition, g_fft4, g_state); + } } -/***************************************************/ -// Die Hash-Funktion -__global__ void x11_simd512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +__global__ void __launch_bounds__(TPB,4) +x11_simd512_gpu_final_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, int *g_state) { int thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -554,36 +632,60 @@ __global__ void x11_simd512_gpu_hash_64(int threads, uint32_t startNounce, uint6 int hashPosition = nounce - startNounce; uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; - SIMDHash(Hash, Hash); + Final(Hash, hashPosition, g_fft4, g_state); } } - // Setup-Funktionen __host__ void x11_simd512_cpu_init(int thr_id, int threads) { + cudaMalloc( &d_state[thr_id], 32*sizeof(int)*threads ); + cudaMalloc( &d_temp4[thr_id], 64*sizeof(uint4)*threads ); + +#if 1 + // Textur für 128 Bit Zugriffe + cudaChannelFormatDesc channelDesc128 = cudaCreateChannelDesc(); + texRef1D_128.normalized = 0; + texRef1D_128.filterMode = cudaFilterModePoint; + texRef1D_128.addressMode[0] = cudaAddressModeClamp; + cudaBindTexture(NULL, &texRef1D_128, d_temp4[thr_id], &channelDesc128, 64*sizeof(uint4)*threads); +#endif + cudaMemcpyToSymbol( c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol( c_FFT, h_FFT, sizeof(h_FFT), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol( c_P8, h_P8, sizeof(h_P8), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol( c_Q8, h_Q8, sizeof(h_Q8), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol( c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol( c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice); + + + // CH + cudaMemcpyToSymbol( d_cw0, h_cw0, sizeof(h_cw0), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol( d_cw1, h_cw1, sizeof(h_cw1), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol( d_cw2, h_cw2, sizeof(h_cw2), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol( d_cw3, h_cw3, sizeof(h_cw3), 0, cudaMemcpyHostToDevice); + +// cudaFuncSetCacheConfig(x11_simd512_gpu_compress1_64, cudaFuncCachePreferL1); +// cudaFuncSetCacheConfig(x11_simd512_gpu_compress2_64, cudaFuncCachePreferL1); } __host__ void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { - const int threadsperblock = 256; + const int threadsperblock = TPB; + + // Größe des dynamischen Shared Memory Bereichs + size_t shared_size = 0; // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; + dim3 grid8(((threads + threadsperblock-1)/threadsperblock)*8); + x11_simd512_gpu_expand_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id]); -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); + dim3 grid((threads + threadsperblock-1)/threadsperblock); + + // künstlich die Occupancy limitieren, um das totale Erschöpfen des Texture Cache zu vermeiden + x11_simd512_gpu_compress1_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]); + x11_simd512_gpu_compress2_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]); + + x11_simd512_gpu_final_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]); - x11_simd512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); MyStreamSynchronize(NULL, order, thr_id); } - diff --git a/x11/simd_functions.cu b/x11/simd_functions.cu new file mode 100644 index 0000000000..fe5697dd21 --- /dev/null +++ b/x11/simd_functions.cu @@ -0,0 +1,1413 @@ +__device__ __forceinline__ void STEP8_IF_0(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[1]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[0]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[3]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[2]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[5]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[4]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[7]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[6]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_1(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[6]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[7]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[4]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[5]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[2]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[3]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[0]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[1]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_2(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[2]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[3]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[0]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[1]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[6]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[7]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[4]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[5]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_3(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[3]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[2]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[1]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[0]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[7]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[6]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[5]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[4]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_4(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[5]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[4]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[7]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[6]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[1]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[0]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[3]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[2]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_5(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[7]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[6]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[5]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[4]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[3]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[2]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[1]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[0]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_6(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[4]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[5]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[6]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[7]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[0]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[1]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[2]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[3]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_7(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[1]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[0]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[3]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[2]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[5]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[4]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[7]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[6]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_8(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[6]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[7]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[4]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[5]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[2]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[3]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[0]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[1]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_9(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[2]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[3]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[0]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[1]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[6]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[7]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[4]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[5]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_10(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[3]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[2]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[1]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[0]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[7]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[6]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[5]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[4]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_11(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[5]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[4]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[7]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[6]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[1]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[0]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[3]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[2]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_12(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[7]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[6]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[5]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[4]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[3]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[2]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[1]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[0]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_13(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[4]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[5]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[6]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[7]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[0]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[1]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[2]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[3]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_14(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[1]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[0]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[3]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[2]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[5]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[4]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[7]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[6]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_15(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[6]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[7]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[4]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[5]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[2]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[3]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[0]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[1]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_16(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[2]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[3]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[0]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[1]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[6]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[7]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[4]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[5]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_17(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[3]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[2]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[1]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[0]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[7]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[6]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[5]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[4]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_18(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[5]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[4]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[7]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[6]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[1]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[0]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[3]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[2]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_19(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[7]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[6]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[5]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[4]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[3]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[2]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[1]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[0]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_20(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[4]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[5]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[6]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[7]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[0]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[1]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[2]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[3]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_21(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[1]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[0]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[3]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[2]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[5]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[4]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[7]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[6]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_22(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[6]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[7]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[4]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[5]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[2]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[3]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[0]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[1]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_23(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[2]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[3]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[0]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[1]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[6]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[7]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[4]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[5]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_24(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[3]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[2]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[1]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[0]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[7]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[6]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[5]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[4]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_25(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[5]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[4]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[7]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[6]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[1]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[0]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[3]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[2]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_26(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[7]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[6]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[5]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[4]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[3]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[2]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[1]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[0]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_27(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[4]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[5]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[6]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[7]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[0]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[1]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[2]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[3]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_28(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[1]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[0]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[3]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[2]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[5]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[4]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[7]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[6]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_29(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[6]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[7]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[4]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[5]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[2]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[3]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[0]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[1]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_30(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[2]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[3]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[0]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[1]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[6]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[7]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[4]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[5]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_31(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[3]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[2]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[1]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[0]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[7]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[6]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[5]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[4]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_32(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[5]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[4]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[7]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[6]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[1]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[0]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[3]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[2]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_33(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[7]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[6]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[5]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[4]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[3]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[2]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[1]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[0]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_34(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[4]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[5]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[6]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[7]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[0]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[1]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[2]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[3]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_35(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + int j; + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[1]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[0]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[3]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[2]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[5]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[4]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[7]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[6]; +#pragma unroll 8 + for(j=0; j<8; j++) { + A[j] = R[j]; + } +} +static __constant__ uint32_t d_cw0[8][8]; +static const uint32_t h_cw0[8][8] = { + 0x531B1720, 0xAC2CDE09, 0x0B902D87, 0x2369B1F4, 0x2931AA01, 0x02E4B082, 0xC914C914, 0xC1DAE1A6, + 0xF18C2B5C, 0x08AC306B, 0x27BFC914, 0xCEDC548D, 0xC630C4BE, 0xF18C4335, 0xF0D3427C, 0xBE3DA380, + 0x143C02E4, 0xA948C630, 0xA4F2DE09, 0xA71D2085, 0xA439BD84, 0x109FCD6A, 0xEEA8EF61, 0xA5AB1CE8, + 0x0B90D4A4, 0x3D6D039D, 0x25944D53, 0xBAA0E034, 0x5BC71E5A, 0xB1F4F2FE, 0x12CADE09, 0x548D41C3, + 0x3CB4F80D, 0x36ECEBC4, 0xA66443EE, 0x43351ABD, 0xC7A20C49, 0xEB0BB366, 0xF5293F98, 0x49B6DE09, + 0x531B29EA, 0x02E402E4, 0xDB25C405, 0x53D4E543, 0x0AD71720, 0xE1A61A04, 0xB87534C1, 0x3EDF43EE, + 0x213E50F0, 0x39173EDF, 0xA9485B0E, 0xEEA82EF9, 0x14F55771, 0xFAF15546, 0x3D6DD9B3, 0xAB73B92E, + 0x582A48FD, 0xEEA81892, 0x4F7EAA01, 0xAF10A88F, 0x11581720, 0x34C124DB, 0xD1C0AB73, 0x1E5AF0D3 +}; +__device__ __forceinline__ void Round8_0_final(uint32_t *A, + int r, int s, int t, int u) { + + + STEP8_IF_0(d_cw0[0], r, s, A, &A[8], &A[16], &A[24]); + STEP8_IF_1(d_cw0[1], s, t, &A[24], A, &A[8], &A[16]); + STEP8_IF_2(d_cw0[2], t, u, &A[16], &A[24], A, &A[8]); + STEP8_IF_3(d_cw0[3], u, r, &A[8], &A[16], &A[24], A); + STEP8_MAJ_4(d_cw0[4], r, s, A, &A[8], &A[16], &A[24]); + STEP8_MAJ_5(d_cw0[5], s, t, &A[24], A, &A[8], &A[16]); + STEP8_MAJ_6(d_cw0[6], t, u, &A[16], &A[24], A, &A[8]); + STEP8_MAJ_7(d_cw0[7], u, r, &A[8], &A[16], &A[24], A); +} +static __constant__ uint32_t d_cw1[8][8]; +static const uint32_t h_cw1[8][8] = { + 0xC34C07F3, 0xC914143C, 0x599CBC12, 0xBCCBE543, 0x385EF3B7, 0x14F54C9A, 0x0AD7C068, 0xB64A21F7, + 0xDEC2AF10, 0xC6E9C121, 0x56B8A4F2, 0x1158D107, 0xEB0BA88F, 0x050FAABA, 0xC293264D, 0x548D46D2, + 0xACE5E8E0, 0x53D421F7, 0xF470D279, 0xDC974E0C, 0xD6CF55FF, 0xFD1C4F7E, 0x36EC36EC, 0x3E261E5A, + 0xEBC4FD1C, 0x56B839D0, 0x5B0E21F7, 0x58E3DF7B, 0x5BC7427C, 0xEF613296, 0x1158109F, 0x5A55E318, + 0xA7D6B703, 0x1158E76E, 0xB08255FF, 0x50F05771, 0xEEA8E8E0, 0xCB3FDB25, 0x2E40548D, 0xE1A60F2D, + 0xACE5D616, 0xFD1CFD1C, 0x24DB3BFB, 0xAC2C1ABD, 0xF529E8E0, 0x1E5AE5FC, 0x478BCB3F, 0xC121BC12, + 0xF4702B5C, 0xC293FC63, 0xDA6CB2AD, 0x45601FCC, 0xA439E1A6, 0x4E0C0D02, 0xED3621F7, 0xAB73BE3D, + 0x0E74D4A4, 0xF754CF95, 0xD84136EC, 0x3124AB73, 0x39D03B42, 0x0E74BCCB, 0x0F2DBD84, 0x41C35C80 +}; +__device__ __forceinline__ void Round8_1_final(uint32_t *A, + int r, int s, int t, int u) { + + + STEP8_IF_8(d_cw1[0], r, s, A, &A[8], &A[16], &A[24]); + STEP8_IF_9(d_cw1[1], s, t, &A[24], A, &A[8], &A[16]); + STEP8_IF_10(d_cw1[2], t, u, &A[16], &A[24], A, &A[8]); + STEP8_IF_11(d_cw1[3], u, r, &A[8], &A[16], &A[24], A); + STEP8_MAJ_12(d_cw1[4], r, s, A, &A[8], &A[16], &A[24]); + STEP8_MAJ_13(d_cw1[5], s, t, &A[24], A, &A[8], &A[16]); + STEP8_MAJ_14(d_cw1[6], t, u, &A[16], &A[24], A, &A[8]); + STEP8_MAJ_15(d_cw1[7], u, r, &A[8], &A[16], &A[24], A); +} +static __constant__ uint32_t d_cw2[8][8]; +static const uint32_t h_cw2[8][8] = { + 0xA4135BED, 0xE10E1EF2, 0x6C4F93B1, 0x6E2191DF, 0xE2E01D20, 0xD1952E6B, 0x6A7D9583, 0x131DECE3, + 0x369CC964, 0xFB73048D, 0x9E9D6163, 0x280CD7F4, 0xD9C6263A, 0x1062EF9E, 0x2AC7D539, 0xAD2D52D3, + 0x0A03F5FD, 0x197CE684, 0xAA72558E, 0xDE5321AD, 0xF0870F79, 0x607A9F86, 0xAFE85018, 0x2AC7D539, + 0xE2E01D20, 0x2AC7D539, 0xC6A93957, 0x624C9DB4, 0x6C4F93B1, 0x641E9BE2, 0x452CBAD4, 0x263AD9C6, + 0xC964369C, 0xC3053CFB, 0x452CBAD4, 0x95836A7D, 0x4AA2B55E, 0xAB5B54A5, 0xAC4453BC, 0x74808B80, + 0xCB3634CA, 0xFC5C03A4, 0x4B8BB475, 0x21ADDE53, 0xE2E01D20, 0xDF3C20C4, 0xBD8F4271, 0xAA72558E, + 0xFC5C03A4, 0x48D0B730, 0x2AC7D539, 0xD70B28F5, 0x53BCAC44, 0x3FB6C04A, 0x14EFEB11, 0xDB982468, + 0x9A1065F0, 0xB0D14F2F, 0x8D5272AE, 0xC4D73B29, 0x91DF6E21, 0x949A6B66, 0x303DCFC3, 0x5932A6CE +}; +__device__ __forceinline__ void Round8_2_final(uint32_t *A, + int r, int s, int t, int u) { + + + STEP8_IF_16(d_cw2[0], r, s, A, &A[8], &A[16], &A[24]); + STEP8_IF_17(d_cw2[1], s, t, &A[24], A, &A[8], &A[16]); + STEP8_IF_18(d_cw2[2], t, u, &A[16], &A[24], A, &A[8]); + STEP8_IF_19(d_cw2[3], u, r, &A[8], &A[16], &A[24], A); + STEP8_MAJ_20(d_cw2[4], r, s, A, &A[8], &A[16], &A[24]); + STEP8_MAJ_21(d_cw2[5], s, t, &A[24], A, &A[8], &A[16]); + STEP8_MAJ_22(d_cw2[6], t, u, &A[16], &A[24], A, &A[8]); + STEP8_MAJ_23(d_cw2[7], u, r, &A[8], &A[16], &A[24], A); +} +static __constant__ uint32_t d_cw3[8][8]; +static const uint32_t h_cw3[8][8] = { + 0x1234EDCC, 0xF5140AEC, 0xCDF1320F, 0x3DE4C21C, 0x48D0B730, 0x1234EDCC, 0x131DECE3, 0x52D3AD2D, + 0xE684197C, 0x6D3892C8, 0x72AE8D52, 0x6FF3900D, 0x73978C69, 0xEB1114EF, 0x15D8EA28, 0x71C58E3B, + 0x90F66F0A, 0x15D8EA28, 0x9BE2641E, 0x65F09A10, 0xEA2815D8, 0xBD8F4271, 0x3A40C5C0, 0xD9C6263A, + 0xB38C4C74, 0xBAD4452C, 0x70DC8F24, 0xAB5B54A5, 0x46FEB902, 0x1A65E59B, 0x0DA7F259, 0xA32A5CD6, + 0xD62229DE, 0xB81947E7, 0x6D3892C8, 0x15D8EA28, 0xE59B1A65, 0x065FF9A1, 0xB2A34D5D, 0x6A7D9583, + 0x975568AB, 0xFC5C03A4, 0x2E6BD195, 0x966C6994, 0xF2590DA7, 0x263AD9C6, 0x5A1BA5E5, 0xB0D14F2F, + 0x975568AB, 0x6994966C, 0xF1700E90, 0xD3672C99, 0xCC1F33E1, 0xFC5C03A4, 0x452CBAD4, 0x4E46B1BA, + 0xF1700E90, 0xB2A34D5D, 0xD0AC2F54, 0x5760A8A0, 0x8C697397, 0x624C9DB4, 0xE85617AA, 0x95836A7D +}; +__device__ __forceinline__ void Round8_3_final(uint32_t *A, + int r, int s, int t, int u) { + + + STEP8_IF_24(d_cw3[0], r, s, A, &A[8], &A[16], &A[24]); + STEP8_IF_25(d_cw3[1], s, t, &A[24], A, &A[8], &A[16]); + STEP8_IF_26(d_cw3[2], t, u, &A[16], &A[24], A, &A[8]); + STEP8_IF_27(d_cw3[3], u, r, &A[8], &A[16], &A[24], A); + STEP8_MAJ_28(d_cw3[4], r, s, A, &A[8], &A[16], &A[24]); + STEP8_MAJ_29(d_cw3[5], s, t, &A[24], A, &A[8], &A[16]); + STEP8_MAJ_30(d_cw3[6], t, u, &A[16], &A[24], A, &A[8]); + STEP8_MAJ_31(d_cw3[7], u, r, &A[8], &A[16], &A[24], A); +} + +#if __CUDA_ARCH__ < 350 +#define expanded_vector(x) tex1Dfetch(texRef1D_128, (x)) +#else +//#define expanded_vector(x) tex1Dfetch(texRef1D_128, (x)) +#define expanded_vector(x) __ldg(&g_fft4[x]) +#endif + +__device__ __forceinline__ void Round8_0(uint32_t *A, const int thr_offset, + int r, int s, int t, int u, uint4 *g_fft4) { + uint32_t w[8]; + uint4 hv1, hv2; + + int tmp = 0 + thr_offset; + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_0(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_1(w, s, t, &A[24], A, &A[8], &A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_2(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_3(w, u, r, &A[8], &A[16], &A[24], A); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_4(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_5(w, s, t, &A[24], A, &A[8], &A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_6(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_7(w, u, r, &A[8], &A[16], &A[24], A); + + +} +__device__ __forceinline__ void Round8_1(uint32_t *A, const int thr_offset, + int r, int s, int t, int u, uint4 *g_fft4) { + uint32_t w[8]; + uint4 hv1, hv2; + + int tmp = 16 + thr_offset; + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_8(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_9(w, s, t, &A[24], A, &A[8], &A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_10(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_11(w, u, r, &A[8], &A[16], &A[24], A); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_12(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_13(w, s, t, &A[24], A, &A[8], &A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_14(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_15(w, u, r, &A[8], &A[16], &A[24], A); + + +} +__device__ __forceinline__ void Round8_2(uint32_t *A, const int thr_offset, + int r, int s, int t, int u, uint4 *g_fft4) { + uint32_t w[8]; + uint4 hv1, hv2; + + int tmp = 32 + thr_offset; + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_16(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_17(w, s, t, &A[24], A, &A[8], &A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_18(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_19(w, u, r, &A[8], &A[16], &A[24], A); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_20(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_21(w, s, t, &A[24], A, &A[8], &A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_22(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_23(w, u, r, &A[8], &A[16], &A[24], A); + + +} +__device__ __forceinline__ void Round8_3(uint32_t *A, const int thr_offset, + int r, int s, int t, int u, uint4 *g_fft4) { + uint32_t w[8]; + uint4 hv1, hv2; + + int tmp = 48 + thr_offset; + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_24(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_25(w, s, t, &A[24], A, &A[8], &A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_26(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_27(w, u, r, &A[8], &A[16], &A[24], A); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_28(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_29(w, s, t, &A[24], A, &A[8], &A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_30(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_31(w, u, r, &A[8], &A[16], &A[24], A); + + +} + +__device__ __forceinline__ void SIMD_Compress1(uint32_t *A, const int thr_id, const uint32_t *M, uint4 *g_fft4) { + int i; + const int thr_offset = thr_id << 6; // thr_id * 128 (je zwei elemente) +#pragma unroll 8 + for(i=0; i<8; i++) { + A[i] ^= M[i]; + (&A[8])[i] ^= M[8+i]; + } + Round8_0(A, thr_offset, 3, 23, 17, 27, g_fft4); + Round8_1(A, thr_offset, 28, 19, 22, 7, g_fft4); +} + +__device__ __forceinline__ void Compression1(const uint32_t *hashval, const int texture_id, uint4 *g_fft4, int *g_state) { + uint32_t A[32]; + int i; +#pragma unroll 32 + for (i=0; i < 32; i++) A[i] = c_IV_512[i]; + uint32_t buffer[16]; +#pragma unroll 16 + for (i=0; i < 16; i++) buffer[i] = hashval[i]; + SIMD_Compress1(A, texture_id, buffer, g_fft4); + uint32_t *state = (uint32_t*)&g_state[blockIdx.x * (blockDim.x*32)]; +#pragma unroll 32 + for (i=0; i < 32; i++) state[threadIdx.x+blockDim.x*i] = A[i]; +} + +__device__ __forceinline__ void SIMD_Compress2(uint32_t *A, const int thr_id, uint4 *g_fft4) { + uint32_t IV[4][8]; + int i; + const int thr_offset = thr_id << 6; // thr_id * 128 (je zwei elemente) +#pragma unroll 8 + for(i=0; i<8; i++) { + IV[0][i] = c_IV_512[i]; + IV[1][i] = c_IV_512[8+i]; + IV[2][i] = c_IV_512[16+i]; + IV[3][i] = c_IV_512[24+i]; + } + Round8_2(A, thr_offset, 29, 9, 15, 5, g_fft4); + Round8_3(A, thr_offset, 4, 13, 10, 25, g_fft4); + STEP8_IF_32(IV[0], 4, 13, A, &A[8], &A[16], &A[24]); + STEP8_IF_33(IV[1], 13, 10, &A[24], A, &A[8], &A[16]); + STEP8_IF_34(IV[2], 10, 25, &A[16], &A[24], A, &A[8]); + STEP8_IF_35(IV[3], 25, 4, &A[8], &A[16], &A[24], A); +} + +__device__ __forceinline__ void Compression2(const int texture_id, uint4 *g_fft4, int *g_state) { + uint32_t A[32]; + int i; + uint32_t *state = (uint32_t*)&g_state[blockIdx.x * (blockDim.x*32)]; +#pragma unroll 32 + for (i=0; i < 32; i++) A[i] = state[threadIdx.x+blockDim.x*i]; + SIMD_Compress2(A, texture_id, g_fft4); +#pragma unroll 32 + for (i=0; i < 32; i++) state[threadIdx.x+blockDim.x*i] = A[i]; +} + +__device__ __forceinline__ void SIMD_Compress_Final(uint32_t *A, const uint32_t *M) { + uint32_t IV[4][8]; + int i; +#pragma unroll 8 + for(i=0; i<8; i++) { + IV[0][i] = A[i]; + IV[1][i] = (&A[8])[i]; + IV[2][i] = (&A[16])[i]; + IV[3][i] = (&A[24])[i]; + } +#pragma unroll 8 + for(i=0; i<8; i++) { + A[i] ^= M[i]; + (&A[8])[i] ^= M[8+i]; + } + Round8_0_final(A, 3, 23, 17, 27); + Round8_1_final(A, 28, 19, 22, 7); + Round8_2_final(A, 29, 9, 15, 5); + Round8_3_final(A, 4, 13, 10, 25); + STEP8_IF_32(IV[0], 4, 13, A, &A[8], &A[16], &A[24]); + STEP8_IF_33(IV[1], 13, 10, &A[24], A, &A[8], &A[16]); + STEP8_IF_34(IV[2], 10, 25, &A[16], &A[24], A, &A[8]); + STEP8_IF_35(IV[3], 25, 4, &A[8], &A[16], &A[24], A); +} + +__device__ __forceinline__ void Final(uint32_t *hashval, const int texture_id, uint4 *g_fft4, int *g_state) { + uint32_t A[32]; + int i; + uint32_t *state = (uint32_t*)&g_state[blockIdx.x * (blockDim.x*32)]; +#pragma unroll 32 + for (i=0; i < 32; i++) A[i] = state[threadIdx.x+blockDim.x*i]; + uint32_t buffer[16]; + buffer[0] = 512; +#pragma unroll 15 + for (i=1; i < 16; i++) buffer[i] = 0; + SIMD_Compress_Final(A, buffer); +#pragma unroll 16 + for (i=0; i < 16; i++) + hashval[i] = A[i]; +} diff --git a/x11/x11.cu b/x11/x11.cu index f5382ea865..77b6a72358 100644 --- a/x11/x11.cu +++ b/x11/x11.cu @@ -163,12 +163,14 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, const uint32_t first_nonce = pdata[19]; // TODO: entfernen für eine Release! Ist nur zum Testen! - if (opt_benchmark) + if (opt_benchmark) { ((uint32_t*)ptarget)[7] = 0x0000ff; + pdata[17] = 0; + } const uint32_t Htarg = ptarget[7]; - const int throughput = 256*256*16; + const int throughput = 256*256*8; static bool init[8] = {0,0,0,0,0,0,0,0}; if (!init[thr_id])